Commit 099146d
Fix ocr based configs and add LoRRA configs for VizWiz and VQA2
apsdehal committed Apr 26, 2019
1 parent 87fede2 commit 099146d
Showing 6 changed files with 276 additions and 15 deletions.
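
These configs are consumed by Pythia's runner. Assuming the command-line interface of this release (the flag names are my assumption, not taken from this commit), training LoRRA on VizWiz with the new file would look roughly like `python tools/run.py --tasks vqa --datasets vizwiz --model lorra --config configs/vqa/vizwiz/lorra.yml`.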
12 changes: 11 additions & 1 deletion configs/vqa/textvqa/lorra.yml
@@ -5,10 +5,20 @@ task_attributes:
   vqa:
     dataset_attributes:
       textvqa:
+        use_ocr: true
         processors:
           answer_processor:
+            type: soft_copy_answer
             params:
-              use_soft_copy: true
+              vocab_file: vocabs/answers_textvqa_more_than_1.txt
+              preprocessor:
+                type: simple_word
+                params: {}
+              context_preprocessor:
+                type: simple_word
+                params: {}
+              max_length: 50
+              num_answers: 10
 model_attributes:
   lorra: &lorra
     model_data_dir: ../data
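
A note on the soft-copy settings above, as I read them (an interpretation, not documented semantics): soft_copy_answer extends the fixed answer vocabulary with copy slots for OCR tokens, so max_length bounds how many OCR tokens can be copied and context_preprocessor normalizes those tokens before answer matching. Annotated sketch:

# My reading of the soft_copy_answer params (annotations are assumptions)
answer_processor:
  type: soft_copy_answer
  params:
    vocab_file: vocabs/answers_textvqa_more_than_1.txt  # fixed answer vocab
    preprocessor:            # normalizes ground-truth answer strings
      type: simple_word
      params: {}
    context_preprocessor:    # normalizes OCR tokens before matching
      type: simple_word
      params: {}
    max_length: 50           # up to 50 OCR copy slots appended to the vocab
    num_answers: 10          # ten human answers per question, VQA-style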
120 changes: 120 additions & 0 deletions configs/vqa/vizwiz/lorra.yml
@@ -0,0 +1,120 @@
includes:
- common/defaults/configs/tasks/vqa/vizwiz.yml
# Use soft copy
task_attributes:
  vqa:
    dataset_attributes:
      vizwiz:
        use_ocr: true
        processors:
          answer_processor:
            type: soft_copy_answer
            params:
              vocab_file: vocabs/answers_vizwiz_7k.txt
              preprocessor:
                type: simple_word
                params: {}
              context_preprocessor:
                type: simple_word
                params: {}
              max_length: 50
              num_answers: 10
model_attributes:
  lorra: &lorra
    model_data_dir: ../data
    metrics:
    - type: vqa_accuracy
    losses:
    - type: logit_bce
    num_context_features: 1
    context_feature_dim: 300
    image_feature_dim: 2048
    context_max_len: 50
    classifier:
      type: logit
      params:
        img_hidden_dim: 5000
        text_hidden_dim: 300
    image_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: softmax
      transform:
        type: linear
        params:
          out_dim: 1
    context_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: sigmoid
      transform:
        type: linear
        params:
          out_dim: 1
    image_feature_encodings:
    - type: finetune_faster_rcnn_fpn_fc7
      params:
        bias_file: detectron/fc6/fc7_b.pkl
        weights_file: detectron/fc6/fc7_w.pkl
    - type: default
      params: {}
    context_feature_encodings:
    - type: default
      params: {}
    image_text_modal_combine:
      type: non_linear_element_multiply
      params:
        dropout: 0
        hidden_dim: 5000
    # 300 for FastText and 50 for order vectors
    context_dim: 350
    text_embeddings:
    - type: attention
      params:
        hidden_dim: 1024
        num_layers: 1
        conv1_out: 512
        conv2_out: 2
        dropout: 0
        embedding_dim: 300
        kernel_size: 1
        padding: 0
    context_embeddings:
    - type: identity
      params:
        embedding_dim: 350
  lorra_with_glove: *lorra
optimizer_attributes:
  params:
    eps: 1.0e-08
    lr: 0.01
    weight_decay: 0
  type: Adamax
training_parameters:
  clip_norm_mode: all
  clip_gradients: true
  max_grad_l2_norm: 0.25
  lr_scheduler: true
  lr_steps:
  - 14000
  lr_ratio: 0.01
  use_warmup: true
  warmup_factor: 0.2
  warmup_iterations: 1000
  max_iterations: 24000
  batch_size: 128
  num_workers: 7
  task_size_proportional_sampling: true
  monitored_metric: vqa_accuracy
  pretrained_mapping:
    text_embeddings: text_embeddings
    image_feature_encoders: image_feature_encoders
    image_feature_embeddings_list: image_feature_embeddings_list
    image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer
  metric_minimize: false
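
One detail worth noting in this file: `lorra: &lorra` defines a YAML anchor and `lorra_with_glove: *lorra` aliases it, so both model keys resolve to the exact same attribute mapping; any GloVe-specific divergence would have to come from elsewhere, since a plain alias cannot override fields. Minimal illustration with hypothetical keys:

base: &base
  lr: 0.01
  dim: 300
same_as_base: *base   # identical mapping; overriding a field would need a
                      # merge key, e.g. `variant: {<<: *base, dim: 512}`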
125 changes: 125 additions & 0 deletions configs/vqa/vqa2/lorra.yml
@@ -0,0 +1,125 @@
includes:
- common/defaults/configs/tasks/vqa/vqa2.yml
# Use soft copy
task_attributes:
  vqa:
    dataset_attributes:
      vqa2:
        use_ocr: true
        processors:
          answer_processor:
            type: soft_copy_answer
            params:
              vocab_file: vocabs/answers_vqa.txt
              preprocessor:
                type: simple_word
                params: {}
              context_preprocessor:
                type: simple_word
                params: {}
              max_length: 50
              num_answers: 10
model_attributes:
  lorra: &lorra
    model_data_dir: ../data
    metrics:
    - type: vqa_accuracy
    losses:
    - type: logit_bce
    num_context_features: 1
    context_feature_dim: 300
    image_feature_dim: 2048
    context_max_len: 50
    classifier:
      type: logit
      params:
        img_hidden_dim: 5000
        text_hidden_dim: 300
    image_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: softmax
      transform:
        type: linear
        params:
          out_dim: 1
    context_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: sigmoid
      transform:
        type: linear
        params:
          out_dim: 1
    image_feature_encodings:
    - type: finetune_faster_rcnn_fpn_fc7
      params:
        bias_file: detectron/fc6/fc7_b.pkl
        weights_file: detectron/fc6/fc7_w.pkl
    - type: default
      params: {}
    context_feature_encodings:
    - type: default
      params: {}
    image_text_modal_combine:
      type: non_linear_element_multiply
      params:
        dropout: 0
        hidden_dim: 5000
    # 300 for FastText and 50 for order vectors
    context_dim: 350
    text_embeddings:
    - type: attention
      params:
        hidden_dim: 1024
        num_layers: 1
        conv1_out: 512
        conv2_out: 2
        dropout: 0
        embedding_dim: 300
        kernel_size: 1
        padding: 0
    context_embeddings:
    - type: identity
      params:
        embedding_dim: 350
  lorra_with_glove: *lorra
optimizer_attributes:
  type: Adamax
  params:
    eps: 1.0e-08
    lr: 0.01
    weight_decay: 0
training_parameters:
  clip_norm_mode: all
  clip_gradients: true
  lr_ratio: 0.1
  lr_scheduler: true
  lr_steps:
  - 15000
  - 18000
  - 20000
  - 21000
  max_grad_l2_norm: 0.25
  max_iterations: 22000
  use_warmup: true
  warmup_factor: 0.2
  warmup_iterations: 1000
  patience: 4000
  batch_size: 512
  num_workers: 7
  task_size_proportional_sampling: true
  monitored_metric: vqa_accuracy
  pretrained_mapping:
    text_embeddings: text_embeddings
    context_embeddings: context_embeddings
    image_feature_encoders: image_feature_encoders
    image_feature_embeddings_list: image_feature_embeddings_list
    image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer
  metric_minimize: false
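
The two new LoRRA schedules differ more than the model sections do. Assuming the usual warmup-then-step rule (an assumption about the scheduler, not stated in this commit): training starts at warmup_factor * lr = 0.2 * 0.01 = 0.002 and ramps linearly to 0.01 over the first 1000 iterations; VizWiz then cuts once to 0.01 * 0.01 = 1.0e-04 at iteration 14000, while VQA2 multiplies by lr_ratio = 0.1 at each of 15000, 18000, 20000 and 21000, ending at 0.01 * 0.1^4 = 1.0e-06 by iteration 22000. VQA2 also trains with batch_size 512 versus 128 and adds patience: 4000 for early stopping.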
9 changes: 2 additions & 7 deletions pythia/common/defaults/configs/tasks/vqa/textvqa.yml
@@ -33,18 +33,13 @@ task_attributes:
                 embedding_name: glove.6B.300d
                 vocab_file: vocabs/vocabulary_100k.txt
           answer_processor:
-            type: soft_copy_answer
+            type: vqa_answer
             params:
-              vocab_file: vocabs/answers_textvqa_more_than_1.txt
+              vocab_file: vocabs/answers_textvqa_8k.txt
               preprocessor:
                 type: simple_word
                 params: {}
-              context_preprocessor:
-                type: simple_word
-                params: {}
-              max_length: 50
               num_answers: 10
-              use_soft_copy: false
           context_processor:
             type: fasttext
             params:
9 changes: 2 additions & 7 deletions pythia/common/defaults/configs/tasks/vqa/vizwiz.yml
@@ -33,18 +33,13 @@ task_attributes:
                 embedding_name: glove.6B.300d
                 vocab_file: vocabs/vocabulary_100k.txt
           answer_processor:
-            type: soft_copy_answer
+            type: vqa_answer
             params:
               vocab_file: vocabs/answers_vizwiz_7k.txt
               preprocessor:
                 type: simple_word
                 params: {}
-              context_preprocessor:
-                type: simple_word
-                params: {}
-              max_length: 50
               num_answers: 10
-              use_soft_copy: false
           context_processor:
             type: fasttext
             params:
@@ -59,7 +54,7 @@ task_attributes:
               max_length: 50
         return_info: true
         # Return OCR information
-        use_ocr: true
+        use_ocr: false
         # Return spatial information of OCR tokens if present
         use_ocr_info: false
 training_parameters:
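
The defaults change above pairs with the new configs/vqa/vizwiz/lorra.yml: the task defaults now ship with use_ocr: false, and OCR-dependent configs opt back in through includes, where (as I understand the merge order) keys in the including file win. Schematically, not the literal files:

# common/defaults/configs/tasks/vqa/vizwiz.yml  (default)
use_ocr: false

# configs/vqa/vizwiz/lorra.yml  (re-enables OCR after the include merge)
includes:
- common/defaults/configs/tasks/vqa/vizwiz.yml
use_ocr: true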
16 changes: 16 additions & 0 deletions pythia/common/defaults/configs/tasks/vqa/vqa2.yml
@@ -40,7 +40,23 @@ task_attributes:
               preprocessor:
                 type: simple_word
                 params: {}
+          context_processor:
+            type: fasttext
+            params:
+              max_length: 50
+              model_file: .vector_cache/wiki.en.bin
+          ocr_token_processor:
+            type: simple_word
+            params: {}
+          bbox_processor:
+            type: bbox
+            params:
+              max_length: 50
         return_info: true
+        # Return OCR information
+        use_ocr: false
+        # Return spatial information of OCR tokens if present
+        use_ocr_info: false
 training_parameters:
   monitored_metric: vqa2_vqa_accuracy
   metric_minimize: false
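
These additions give VQA2 the same OCR plumbing the other tasks already have, just switched off by default: context_processor embeds each OCR token with pretrained fastText vectors (wiki.en.bin is the 300-dimensional English model), capped at 50 tokens, which lines up with context_feature_dim: 300 and context_max_len: 50 in the LoRRA config above; the extra 50 dimensions in context_dim: 350 are the order vectors called out in that config's comment.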
