
WIP - CrossCLR full code #3

Open · wants to merge 5 commits into base: main
Changes from 1 commit
unpolished code of crossCLR
mzolfaghari committed Feb 6, 2022
commit 02d2a74b654b5357af926e8853b09fd1c8ab3fbb
159 changes: 159 additions & 0 deletions config/lsmdc.yaml
@@ -0,0 +1,159 @@
description: "experiment with 100m features."
random_seed: null
config_type: "ret"
training:
  debug_size: 11000 # Debugging will be deactivated if set to a number bigger than 10000!
loss_func: 'contrastive'
batch_size: 64 #64
num_epochs: 20
compute_clip_retrieval: true

cross_clr_config:
temperature: 0.02
temperature_weights: 0.0055
score_thrshold: 0.9
negative_weight: 0.65 #0.8
queue_size: 10000

contrastive_loss_config:
margin: 0.1
weight_high: 0.5 #1.0
weight_high_cluster: 0 #0.1
weight_low: 1
weight_low_cluster: 0 #.2
weight_context: 0 #0.7 #1.0
weight_context_cluster: 0.0
loss_cycle_cons: 0 #0.001
val:
  debug_size: 1000 # Debugging will be deactivated if set to a number bigger than 10000!
batch_size: 64
val_freq: 1
val_start: 2 #0
val_clips: true
val_clips_freq: 1
det_best_field: "val_clip_score_at_1"
det_best_compare_mode: "max"
det_best_threshold_mode: "rel"
det_best_threshold_value: 1e-4
det_best_terminate_after: 16
test:
  debug_size: 11000 # Debugging will be deactivated if set to a number bigger than 10000!
batch_size: 64
test_freq: 1
test_start: 1 #0
test_clips: true
test_clips_freq: 1


video_local:
name: transformer
output_dim: 512
input_fc: true
input_fc_output_dim: 512
selfatn_config:
hidden_dim: 512
num_layers: 1
num_heads: 4
pointwise_ff_dim: 512
activation: "gelu"
dropout: 0.01
norm: "layernorm_coot"
use_context: true
use_subspace: false #deactivated
pooler_config:
    name: atn # options: atn, avg, max
hidden_dim: 512
num_heads: 4
num_layers: 1
dropout: 0.01
activation: "gelu"
weight_init_type: "truncnorm"
weight_init_std: 0.01
video_global:
name: transformer
# copy all settings from the local network
same_as: "video_local"
output_dim: 512
input_fc: false
use_context: true
use_subspace: false
crossatn_config:
hidden_dim: 512
num_layers: 1
num_heads: 8
pointwise_ff_dim: 512
activation: "gelu"
dropout: 0.01
norm: "layernorm_coot"
pooler_config:
name: "avg"

text_local:
same_as: "video_local"
text_global:
same_as: "video_global"
dataset:
name: lsmdc16
  modality_feat_name_a: action,scene,appearance,audio,howto100m_finetune,object,flow # comma-separated modality names (no spaces); options: mmt or any combination of [action, scene, flow, ...]
modality_feat_name_b: "text" #_feat_youcook2_meta_all_transformers_bert-base-uncased_-2,-1"
train_split: train
val_split: val #test
test_split: test1k
min_frames: 1
use_clips: true
max_frames: 10
feat_agg_dim: 2048 #1024 #512 # Split features and add them along a new dimension
mmt_feat_dim: 1024 #1024
action_feat_dim: 2048 #1024
scene_feat_dim: 2208 #1024
appearance_feat_dim: 2048 #1024
flow_feat_dim: 529 #1024
object_feat_dim: 256 #1024
audio_feat_dim: 2048
howto100m_finetune_feat_dim: 512
text_feat_dim: 1536
frames_noise: 0
word_emb: bert
# video feature loading
add_stop_frame: 2
expand_segments: 0
# technical dataloading details
preload_data: true
pin_memory: true
num_workers: 4
drop_last: false
optimizer:
name: radam
lr: 0.0006
weight_decay: 0 #3.0e-05 #0
weight_decay_for_bias: true
lr_decay_mult: false
momentum: 0.56
adam_beta2: 0.98
adam_eps: 1.5e-09 #1.5e-09
scheduler:
patience: 6
cooldown: 4
warmup: 2

# ---------- Logging / Saving ----------
logging:
step_train: 10
step_val: 10
step_test: 10 #if -1 then ignore testing
step_gpu: -1
step_gpu_once: 10
saving:
keep_freq: -1
save_last: true
save_best: true
save_opt_state: true
# ---------- Technical PyTorch settings ----------
use_cuda: true
use_multi_gpu: true
cudnn_enabled: true
cudnn_benchmark: true
cudnn_deterministic: false
cuda_non_blocking: true
fp16_train: true
fp16_val: true
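The `cross_clr_config` block above parameterizes a contrastive (InfoNCE-style) loss. As a rough illustration of how `temperature` and `negative_weight` could enter such a loss, here is a minimal sketch — the function name and the exact weighting scheme are assumptions for illustration, not the repository's CrossCLR implementation:

```python
import numpy as np

def weighted_infonce(video_emb, text_emb, temperature=0.02, negative_weight=0.65):
    """Sketch of an InfoNCE-style loss with down-weighted negatives.

    Illustrative only; not the repository's CrossCLR loss.
    """
    # L2-normalize rows so dot products become cosine similarities
    v = video_emb / np.linalg.norm(video_emb, axis=1, keepdims=True)
    t = text_emb / np.linalg.norm(text_emb, axis=1, keepdims=True)
    sim = v @ t.T / temperature                  # (B, B) temperature-scaled similarities
    sim = sim - sim.max(axis=1, keepdims=True)   # stabilize the exponentials
    e = np.exp(sim)
    pos = np.diag(e)                             # matching video/text pairs
    neg = e.sum(axis=1) - pos                    # all non-matching pairs per row
    # negatives enter the denominator with reduced weight (negative_weight < 1)
    return float(np.mean(-np.log(pos / (pos + negative_weight * neg))))
```

With a low temperature such as 0.02, well-aligned pairs push the loss toward zero, while a `negative_weight` below 1 softens the contribution of negatives, echoing the `negative_weight: 0.65` setting above.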
10,053 changes: 10,053 additions & 0 deletions data/lsmdc/annot/LSMDC16_annos_test.csv

Large diffs are not rendered by default.

101,079 changes: 101,079 additions & 0 deletions data/lsmdc/annot/LSMDC16_annos_training.csv


7,408 changes: 7,408 additions & 0 deletions data/lsmdc/annot/LSMDC16_annos_val.csv


1,939 changes: 1,939 additions & 0 deletions data/lsmdc/annot/LSMDC16_annos_val_tmp.csv


1,000 changes: 1,000 additions & 0 deletions data/lsmdc/annot_new/LSMDC16_challenge_1000_publictect.csv


10,053 changes: 10,053 additions & 0 deletions data/lsmdc/annot_new/LSMDC16_multiple_choice_test_randomized.csv


101,079 changes: 101,079 additions & 0 deletions data/lsmdc/annot_new/LSMDC16_multiple_choice_train.csv


7,408 changes: 7,408 additions & 0 deletions data/lsmdc/annot_new/LSMDC16_multiple_choice_valid.csv


10,053 changes: 10,053 additions & 0 deletions data/lsmdc/annot_new/LSMDC16_paraphrase_test.csv


19,948 changes: 19,948 additions & 0 deletions data/lsmdc/annot_new/LSMDC16_paraphrase_train_subset.csv


73 changes: 73 additions & 0 deletions data/lsmdc/annot_new/README_MC.txt
@@ -0,0 +1,73 @@
==================================================================
Large Scale Movie Description and Understanding Challenge (LSMDC), at ECCV 2016
==================================================================
Movie Multiple-Choice Test
==================================================================

Get the Linux/Mac download script (downloadChallengeDataMC.sh) and
copy it to the location where you want to save the files and then run:
downloadChallengeDataMC.sh <username-MPIIMD> <password-MPIIMD>

Note: Instructions how to obtain the username/password information
are here: https://sites.google.com/site/describingmovies/lsmdc-2016/download

In the following:
M-VAD: Montreal Video Annotation Dataset [1]
MPII-MD: MPII Movie Description dataset [2]

==================================================================
Annotations
==================================================================

= Files
- Training: LSMDC16_multiple_choice_train.csv
- Validation: LSMDC16_multiple_choice_valid.csv
- Public test: LSMDC16_multiple_choice_test_randomized.csv

= Format
- Each line of the annotation *.csv file contains:
- For LSMDC16_multiple_choice_train.csv and LSMDC16_multiple_choice_valid.csv:
<CLIP_ID>\t<START_ALIGNED>\t<END_ALIGNED>\t<START_EXTRACTED>\t<END_EXTRACTED>\t<CORRECT_SENTENCE>\t<DISTRACTOR_SENTENCE1>..
\t<DISTRACTOR_SENTENCE2>\t<DISTRACTOR_SENTENCE3>\t<DISTRACTOR_SENTENCE4>

- For LSMDC16_multiple_choice_test_randomized.csv:
<CLIP_ID>\t<START_ALIGNED>\t<END_ALIGNED>\t<START_EXTRACTED>\t<END_EXTRACTED>\t<SENTENCE1>\t<SENTENCE2>..
\t<SENTENCE3>\t<SENTENCE4>\t<SENTENCE5>\t<ANSWER_INDEX(between 1-5)>

where "\t" is a TAB character, <START_*> and <END_*> are time-stamps "hh.mm.ss.msmsms" (e.g. 01.02.27.034).
Note that when the manually aligned video clip is shorter than 2 seconds, we symmetrically
expand it (from beginning and end) to be exactly 2 seconds long. Thus, <START_ALIGNED> and <END_ALIGNED>
correspond to precise, manually obtained time-stamps, while <START_EXTRACTED> and <END_EXTRACTED> indicate
the actual extracted clip's start and end.

- <CORRECT_SENTENCE> is the ground-truth caption or annotation for the corresponding video
- <DISTRACTOR_SENTENCE> is a distractor annotation picked at random from the corpus, with the condition that it carries a different activity-phrase label than the correct answer.
However, objects, location, and some context may still be similar between the correct sentence and the distractors; the human activities are chosen to differ from the correct answer
as much as possible.
- <SENTENCE> in the public test can be either a CORRECT_SENTENCE or a DISTRACTOR_SENTENCE
- <ANSWER_INDEX> in the public test is the index of the correct answer, a number between 1 and 5

- The task is a multiple-choice test: given a video query and 5 candidate sentences (1 correct answer and 4 distractor captions), retrieve the correct sentence for the video
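As a quick illustration, one line of the train/valid file described above can be split into named fields like this (a sketch; the helper name and the sample values in the usage below are made up for illustration):

```python
def parse_mc_train_line(line):
    """Split one tab-separated LSMDC16 multiple-choice train/valid line
    into named fields, following the format described above."""
    fields = line.rstrip("\n").split("\t")
    return {
        "clip_id": fields[0],
        "start_aligned": fields[1],
        "end_aligned": fields[2],
        "start_extracted": fields[3],
        "end_extracted": fields[4],
        "correct": fields[5],
        "distractors": fields[6:10],  # the 4 distractor sentences
    }
```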

= Statistics
- Training: 101,079
- Validation: 7,408
- Public Test: 10,053

==================================================================
[1]
@article{AtorabiM-VAD2015,
author = {Torabi, Atousa and Pal, Chris and Larochelle, Hugo and Courville, Aaron},
title = {Using Descriptive Video Services To Create a Large Data Source For Video Annotation Research},
journal = {arXiv preprint},
year = {2015},
url = {https://arxiv.org/pdf/1503.01070v1.pdf}}

[2]
@inproceedings{rohrbach15cvpr,
title={A Dataset for Movie Description},
author={Rohrbach, Anna and Rohrbach, Marcus and Tandon, Niket and Schiele, Bernt},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
url = {https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Rohrbach_A_Dataset_for_2015_CVPR_paper.pdf},
year={2015}}
60 changes: 60 additions & 0 deletions data/lsmdc/annot_new/README_PP.txt
@@ -0,0 +1,60 @@
==================================================================
Large Scale Movie Description and Understanding Challenge (LSMDC), at ECCV 2016
==================================================================
Movie Annotation and Retrieval
==================================================================

Get the Linux/Mac download script (downloadChallengeDataMAR.sh) and
copy it to the location where you want to save the files and then run:
downloadChallengeDataMAR.sh <username-MPIIMD> <password-MPIIMD>

Note: Instructions how to obtain the username/password information
are here: https://sites.google.com/site/describingmovies/lsmdc-2016/download

In the following:
M-VAD: Montreal Video Annotation Dataset [1]
MPII-MD: MPII Movie Description dataset [2]

==================================================================
Annotations
==================================================================

= Files
- Training: LSMDC16_paraphrased_train_subset.txt
- Validation: None (original sentences from movie description could be used)
- Public test: LSMDC16_paraphrased_test.txt

= Format
- Each line of the annotation *.csv file contains:
<CLIP_ID>\t<START_ALIGNED>\t<END_ALIGNED>\t<START_EXTRACTED>\t<END_EXTRACTED>\t<SENTENCE>\t<PARAPHRASED SENTENCE>
where "\t" is a TAB character, <START_*> and <END_*> are time-stamps "hh.mm.ss.msmsms" (e.g. 01.02.27.034).
Note that when the manually aligned video clip is shorter than 2 seconds, we symmetrically
expand it (from beginning and end) to be exactly 2 seconds long. Thus, <START_ALIGNED> and <END_ALIGNED>
correspond to precise, manually obtained time-stamps, while <START_EXTRACTED> and <END_EXTRACTED> indicate
the actual extracted clip's start and end.
- <SENTENCE> is a complete reference sentence
- <PARAPHRASED SENTENCE> is a 3-10-word sentence that summarizes the main aspect of what is described in the original sentence
- The task is to train a model for video annotation and retrieval.
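The "hh.mm.ss.msmsms" time-stamps used in both annotation formats can be converted to seconds with a small helper (a sketch; the function name is mine):

```python
def timestamp_to_seconds(ts):
    """Convert an LSMDC time-stamp "hh.mm.ss.msmsms" (e.g. "01.02.27.034")
    to seconds as a float."""
    hh, mm, ss, ms = ts.split(".")
    return int(hh) * 3600 + int(mm) * 60 + int(ss) + int(ms) / 1000.0

timestamp_to_seconds("01.02.27.034")  # -> 3747.034
```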

= Statistics
- Training: 19,948 (subset of the training data: sentences longer than ~15 words)
- Public Test: 10,053

==================================================================

[1]
@article{AtorabiM-VAD2015,
author = {Torabi, Atousa and Pal, Chris and Larochelle, Hugo and Courville, Aaron},
title = {Using Descriptive Video Services To Create a Large Data Source For Video Annotation Research},
journal = {arXiv preprint},
year = {2015},
url = {https://arxiv.org/pdf/1503.01070v1.pdf}}

[2]
@inproceedings{rohrbach15cvpr,
title={A Dataset for Movie Description},
author={Rohrbach, Anna and Rohrbach, Marcus and Tandon, Niket and Schiele, Bernt},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
url = {https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Rohrbach_A_Dataset_for_2015_CVPR_paper.pdf},
year={2015}}
26 changes: 26 additions & 0 deletions data/lsmdc/annot_new/downloadChallengeDataMC.sh
@@ -0,0 +1,26 @@
#!/bin/bash

EXPECTED_ARGS=2
E_BADARGS=65

if [ $# -lt $EXPECTED_ARGS ]; then
    echo "Usage: $(basename "$0") <username-MPIIMD> <password-MPIIMD> [parallelDownloads=8]"
    exit $E_BADARGS
fi

usernameMD=$1
passwordMD=$2
parallelDownloads=${3:-8}   # default to 8 parallel downloads if not given

########## download annotations
# Training set
wget https://datasets.d2.mpi-inf.mpg.de/movieDescription/protected/lsmdc2016/LSMDC16_multiple_choice_train.csv --user="$usernameMD" --password="$passwordMD"
# Validation set
wget https://datasets.d2.mpi-inf.mpg.de/movieDescription/protected/lsmdc2016/LSMDC16_multiple_choice_valid.csv --user="$usernameMD" --password="$passwordMD"
# Public_Test set randomized
wget https://datasets.d2.mpi-inf.mpg.de/movieDescription/protected/lsmdc2016/LSMDC16_multiple_choice_test_randomized.csv --user="$usernameMD" --password="$passwordMD"
Binary file not shown.
@@ -0,0 +1 @@
{"16392": [7, 9, 17, 4, 18], "21014": [33, 8, 8, 8, 5], "15197": [6, 5, 6, 8, 14], "18044": [10, 14, 13, 10, 18], "23004": [15, 12, 14, 10, 8], "20275": [9, 12, 8, 6, 8], "13565": [9, 17, 6, 12, 20], "10801": [14, 22, 8, 13, 10], "4594": [6, 5, 15, 11, 12], "17875": [10, 4, 11, 6, 7], "7845": [9, 7, 6, 9, 13], "13331": [13, 15, 4, 9, 3], "22745": [9, 8, 23, 14, 5], "14936": [8, 3, 20, 9, 11], "17965": [16, 19, 11, 16, 9], "11940": [14, 14, 9, 5, 3], "9215": [23, 23, 13, 10, 6], "1334": [11, 5, 14, 11, 11], "5755": [12, 18, 5, 7, 22], "6303": [9, 13, 12, 7, 8], "16520": [11, 3, 6, 11, 12], "1898": [13, 18, 14, 6, 8], "7196": [7, 7, 8, 16, 21], "9695": [21, 16, 9, 21, 10], "8382": [9, 9, 9, 16, 13], "18905": [7, 11, 11, 8, 12], "6073": [12, 8, 6, 28, 21], "4825": [10, 5, 10, 8, 7], "23078": [18, 8, 18, 14, 12], "11106": [22, 5, 20, 17, 14]}
Binary file added data/lsmdc/toy/group5/video_features/howto100m.h5
Binary file not shown.
Binary file not shown.