Commit 099146d
Fix ocr based configs and add LoRRA configs for VizWiz and VQA2
apsdehal committed Apr 26, 2019
1 parent 87fede2 commit 099146d
Showing 6 changed files with 276 additions and 15 deletions.
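
These configs are consumed by Pythia's runner. Assuming the command-line interface of this release (the flag names are my assumption, not taken from this commit), training LoRRA on VizWiz with the new file would look roughly like `python tools/run.py --tasks vqa --datasets vizwiz --model lorra --config configs/vqa/vizwiz/lorra.yml`.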
12 changes: 11 additions & 1 deletion configs/vqa/textvqa/lorra.yml
@@ -5,10 +5,20 @@ task_attributes:
   vqa:
     dataset_attributes:
       textvqa:
+        use_ocr: true
         processors:
           answer_processor:
+            type: soft_copy_answer
             params:
-              use_soft_copy: true
+              vocab_file: vocabs/answers_textvqa_more_than_1.txt
+              preprocessor:
+                type: simple_word
+                params: {}
+              context_preprocessor:
+                type: simple_word
+                params: {}
+              max_length: 50
+              num_answers: 10
 model_attributes:
   lorra: &lorra
     model_data_dir: ../data
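
A note on the soft-copy settings above, as I read them (an interpretation, not documented semantics): soft_copy_answer extends the fixed answer vocabulary with copy slots for OCR tokens, so max_length bounds how many OCR tokens can be copied and context_preprocessor normalizes those tokens before answer matching. Annotated sketch:

# My reading of the soft_copy_answer params (annotations are assumptions)
answer_processor:
  type: soft_copy_answer
  params:
    vocab_file: vocabs/answers_textvqa_more_than_1.txt  # fixed answer vocab
    preprocessor:            # normalizes ground-truth answer strings
      type: simple_word
      params: {}
    context_preprocessor:    # normalizes OCR tokens before matching
      type: simple_word
      params: {}
    max_length: 50           # up to 50 OCR copy slots appended to the vocab
    num_answers: 10          # ten human answers per question, VQA-style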
120 changes: 120 additions & 0 deletions configs/vqa/vizwiz/lorra.yml
@@ -0,0 +1,120 @@
includes:
- common/defaults/configs/tasks/vqa/vizwiz.yml
# Use soft copy
task_attributes:
  vqa:
    dataset_attributes:
      vizwiz:
        use_ocr: true
        processors:
          answer_processor:
            type: soft_copy_answer
            params:
              vocab_file: vocabs/answers_vizwiz_7k.txt
              preprocessor:
                type: simple_word
                params: {}
              context_preprocessor:
                type: simple_word
                params: {}
              max_length: 50
              num_answers: 10
model_attributes:
  lorra: &lorra
    model_data_dir: ../data
    metrics:
    - type: vqa_accuracy
    losses:
    - type: logit_bce
    num_context_features: 1
    context_feature_dim: 300
    image_feature_dim: 2048
    context_max_len: 50
    classifier:
      type: logit
      params:
        img_hidden_dim: 5000
        text_hidden_dim: 300
    image_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: softmax
      transform:
        type: linear
        params:
          out_dim: 1
    context_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: sigmoid
      transform:
        type: linear
        params:
          out_dim: 1
    image_feature_encodings:
    - type: finetune_faster_rcnn_fpn_fc7
      params:
        bias_file: detectron/fc6/fc7_b.pkl
        weights_file: detectron/fc6/fc7_w.pkl
    - type: default
      params: {}
    context_feature_encodings:
    - type: default
      params: {}
    image_text_modal_combine:
      type: non_linear_element_multiply
      params:
        dropout: 0
        hidden_dim: 5000
    # 300 for FastText and 50 for order vectors
    context_dim: 350
    text_embeddings:
    - type: attention
      params:
        hidden_dim: 1024
        num_layers: 1
        conv1_out: 512
        conv2_out: 2
        dropout: 0
        embedding_dim: 300
        kernel_size: 1
        padding: 0
    context_embeddings:
    - type: identity
      params:
        embedding_dim: 350
  lorra_with_glove: *lorra
optimizer_attributes:
  params:
    eps: 1.0e-08
    lr: 0.01
    weight_decay: 0
  type: Adamax
training_parameters:
  clip_norm_mode: all
  clip_gradients: true
  max_grad_l2_norm: 0.25
  lr_scheduler: true
  lr_steps:
  - 14000
  lr_ratio: 0.01
  use_warmup: true
  warmup_factor: 0.2
  warmup_iterations: 1000
  max_iterations: 24000
  batch_size: 128
  num_workers: 7
  task_size_proportional_sampling: true
  monitored_metric: vqa_accuracy
  pretrained_mapping:
    text_embeddings: text_embeddings
    image_feature_encoders: image_feature_encoders
    image_feature_embeddings_list: image_feature_embeddings_list
    image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer
  metric_minimize: false
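
One detail worth noting in this file: `lorra: &lorra` defines a YAML anchor and `lorra_with_glove: *lorra` aliases it, so both model keys resolve to the exact same attribute mapping; any GloVe-specific divergence would have to come from elsewhere, since a plain alias cannot override fields. Minimal illustration with hypothetical keys:

base: &base
  lr: 0.01
  dim: 300
same_as_base: *base   # identical mapping; overriding a field would need a
                      # merge key, e.g. `variant: {<<: *base, dim: 512}`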
125 changes: 125 additions & 0 deletions configs/vqa/vqa2/lorra.yml
@@ -0,0 +1,125 @@
includes:
- common/defaults/configs/tasks/vqa/vqa2.yml
# Use soft copy
task_attributes:
  vqa:
    dataset_attributes:
      vqa2:
        use_ocr: true
        processors:
          answer_processor:
            type: soft_copy_answer
            params:
              vocab_file: vocabs/answers_vqa.txt
              preprocessor:
                type: simple_word
                params: {}
              context_preprocessor:
                type: simple_word
                params: {}
              max_length: 50
              num_answers: 10
model_attributes:
  lorra: &lorra
    model_data_dir: ../data
    metrics:
    - type: vqa_accuracy
    losses:
    - type: logit_bce
    num_context_features: 1
    context_feature_dim: 300
    image_feature_dim: 2048
    context_max_len: 50
    classifier:
      type: logit
      params:
        img_hidden_dim: 5000
        text_hidden_dim: 300
    image_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: softmax
      transform:
        type: linear
        params:
          out_dim: 1
    context_feature_embeddings:
    - modal_combine:
        type: non_linear_element_multiply
        params:
          dropout: 0
          hidden_dim: 5000
      normalization: sigmoid
      transform:
        type: linear
        params:
          out_dim: 1
    image_feature_encodings:
    - type: finetune_faster_rcnn_fpn_fc7
      params:
        bias_file: detectron/fc6/fc7_b.pkl
        weights_file: detectron/fc6/fc7_w.pkl
    - type: default
      params: {}
    context_feature_encodings:
    - type: default
      params: {}
    image_text_modal_combine:
      type: non_linear_element_multiply
      params:
        dropout: 0
        hidden_dim: 5000
    # 300 for FastText and 50 for order vectors
    context_dim: 350
    text_embeddings:
    - type: attention
      params:
        hidden_dim: 1024
        num_layers: 1
        conv1_out: 512
        conv2_out: 2
        dropout: 0
        embedding_dim: 300
        kernel_size: 1
        padding: 0
    context_embeddings:
    - type: identity
      params:
        embedding_dim: 350
  lorra_with_glove: *lorra
optimizer_attributes:
  type: Adamax
  params:
    eps: 1.0e-08
    lr: 0.01
    weight_decay: 0
training_parameters:
  clip_norm_mode: all
  clip_gradients: true
  lr_ratio: 0.1
  lr_scheduler: true
  lr_steps:
  - 15000
  - 18000
  - 20000
  - 21000
  max_grad_l2_norm: 0.25
  max_iterations: 22000
  use_warmup: true
  warmup_factor: 0.2
  warmup_iterations: 1000
  patience: 4000
  batch_size: 512
  num_workers: 7
  task_size_proportional_sampling: true
  monitored_metric: vqa_accuracy
  pretrained_mapping:
    text_embeddings: text_embeddings
    context_embeddings: context_embeddings
    image_feature_encoders: image_feature_encoders
    image_feature_embeddings_list: image_feature_embeddings_list
    image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer
  metric_minimize: false
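
The two new LoRRA schedules differ more than the model sections do. Assuming the usual warmup-then-step rule (an assumption about the scheduler, not stated in this commit): training starts at warmup_factor * lr = 0.2 * 0.01 = 0.002 and ramps linearly to 0.01 over the first 1000 iterations; VizWiz then cuts once to 0.01 * 0.01 = 1.0e-04 at iteration 14000, while VQA2 multiplies by lr_ratio = 0.1 at each of 15000, 18000, 20000 and 21000, ending at 0.01 * 0.1^4 = 1.0e-06 by iteration 22000. VQA2 also trains with batch_size 512 versus 128 and adds patience: 4000 for early stopping.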
9 changes: 2 additions & 7 deletions pythia/common/defaults/configs/tasks/vqa/textvqa.yml
@@ -33,18 +33,13 @@ task_attributes:
                 embedding_name: glove.6B.300d
                 vocab_file: vocabs/vocabulary_100k.txt
           answer_processor:
-            type: soft_copy_answer
+            type: vqa_answer
             params:
-              vocab_file: vocabs/answers_textvqa_more_than_1.txt
+              vocab_file: vocabs/answers_textvqa_8k.txt
               preprocessor:
                 type: simple_word
                 params: {}
-              context_preprocessor:
-                type: simple_word
-                params: {}
-              max_length: 50
               num_answers: 10
-              use_soft_copy: false
           context_processor:
             type: fasttext
             params:
9 changes: 2 additions & 7 deletions pythia/common/defaults/configs/tasks/vqa/vizwiz.yml
@@ -33,18 +33,13 @@ task_attributes:
                 embedding_name: glove.6B.300d
                 vocab_file: vocabs/vocabulary_100k.txt
           answer_processor:
-            type: soft_copy_answer
+            type: vqa_answer
             params:
               vocab_file: vocabs/answers_vizwiz_7k.txt
               preprocessor:
                 type: simple_word
                 params: {}
-              context_preprocessor:
-                type: simple_word
-                params: {}
-              max_length: 50
               num_answers: 10
-              use_soft_copy: false
           context_processor:
             type: fasttext
             params:
@@ -59,7 +54,7 @@ task_attributes:
               max_length: 50
         return_info: true
         # Return OCR information
-        use_ocr: true
+        use_ocr: false
         # Return spatial information of OCR tokens if present
         use_ocr_info: false
 training_parameters:
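
The defaults change above pairs with the new configs/vqa/vizwiz/lorra.yml: the task defaults now ship with use_ocr: false, and OCR-dependent configs opt back in through includes, where (as I understand the merge order) keys in the including file win. Schematically, not the literal files:

# common/defaults/configs/tasks/vqa/vizwiz.yml  (default)
use_ocr: false

# configs/vqa/vizwiz/lorra.yml  (re-enables OCR after the include merge)
includes:
- common/defaults/configs/tasks/vqa/vizwiz.yml
use_ocr: true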
16 changes: 16 additions & 0 deletions pythia/common/defaults/configs/tasks/vqa/vqa2.yml
@@ -40,7 +40,23 @@ task_attributes:
               preprocessor:
                 type: simple_word
                 params: {}
+          context_processor:
+            type: fasttext
+            params:
+              max_length: 50
+              model_file: .vector_cache/wiki.en.bin
+          ocr_token_processor:
+            type: simple_word
+            params: {}
+          bbox_processor:
+            type: bbox
+            params:
+              max_length: 50
         return_info: true
+        # Return OCR information
+        use_ocr: false
+        # Return spatial information of OCR tokens if present
+        use_ocr_info: false
 training_parameters:
   monitored_metric: vqa2_vqa_accuracy
   metric_minimize: false
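
These additions give VQA2 the same OCR plumbing the other tasks already have, just switched off by default: context_processor embeds each OCR token with pretrained fastText vectors (wiki.en.bin is the 300-dimensional English model), capped at 50 tokens, which lines up with context_feature_dim: 300 and context_max_len: 50 in the LoRRA config above; the extra 50 dimensions in context_dim: 350 are the order vectors called out in that config's comment.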
