Changed Dockerfile Environment names, updated Readme and Entrypoint Documentation

lapplislazuli · lapplislazuli · commit 7baec718ddc9 · 2022-03-03T17:10:34.000+01:00
diff --git a/Dockerfile b/Dockerfile
@@ -22,9 +22,9 @@ WORKDIR /experiment/code
 
 # Be careful to not add comments after the env variables - they will be added to the string 
 
-ENV do_train true
-ENV do_val true
-ENV do_test true
+ENV DO_TRAIN true
+ENV DO_VALID true
+ENV DO_TEST true
 
 ENV lang java
 ENV lr 5e-5
@@ -35,12 +35,12 @@ ENV target_length 128
 ENV data_dir /dataset
 ENV output_dir /experiment/output
 ENV train_file $data_dir/train_minimal.jsonl
-ENV dev_file $data_dir/valid_minimal.jsonl
+ENV valid_file $data_dir/valid_minimal.jsonl
 ENV test_file $data_dir/test_minimal.jsonl
 ENV epochs 10 
 ENV pretrained_model microsoft/codebert-base
 
 ENV load_existing_model false
-#ENV load_model_path /models/pytorch_model.bin
+ENV load_model_path /models/pytorch_model.bin
 
 ENTRYPOINT ["bash","./entrypoint.sh"]
diff --git a/README.md b/README.md
@@ -2,8 +2,7 @@
 
 This repository holds a docker image which reproduces [Microsofts CodeBERT Code-To-Text Experiment](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text).
 
-The subparts have been minimally changed (see [changes](./changes.md)), but mostly it is just wrapping the experiment in a cpu-based docker image. 
-There is currently no GPU-Image.
+The subparts have been minimally changed (see [changes](./changes.md)), but mostly this repository just wraps the experiment in a Docker image.
 
 The initial readme can be [found here](./initial_readme.md).
 
@@ -14,6 +13,9 @@ The shell file runs the instructions from the initial readme and adds some more
 It worked flawlessly for me on a mac, so I did not want to make extra docker image for data-preprocessing. 
 Depending on your distribution, you might need to install things like wget.
 
+**Note:** The step before is necessary! The `dataset.zip` only contains references to the dataset and is *unfolded* in `prepare.sh`. 
+
+
 After that, change the docker-compose to point to your files (including filenames) and set environment variables as fit. 
 
 You can build the docker file beforehand using 
@@ -92,4 +94,5 @@ CodeBert_CodeToText_Experiment_0_1  | ./entrypoint.sh: line 14: $'\r': command n
 CodeBert_CodeToText_Experiment_0_1  | ./entrypoint.sh: line 200: syntax error: unexpected end of file
 ```
 This is due to windows changing the line-breaks / file encodings. Thanks windows. 
+**Easy Solution**: run `dos2unix entrypoint.sh` and rebuild the container. 
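-Its might easier/faster to pull the image from this repository, or you have to [edit the entrypoint to be compatible with windows](https://askubuntu.com/questions/966488/how-do-i-fix-r-command-not-found-errors-running-bash-scripts-in-wsl). 
+It might be easier/faster to pull the image from this repository; otherwise you have to [edit the entrypoint to be compatible with windows](https://askubuntu.com/questions/966488/how-do-i-fix-r-command-not-found-errors-running-bash-scripts-in-wsl). 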
diff --git a/docker-compose-minimal.yml b/docker-compose-minimal.yml
@@ -9,5 +9,5 @@ services:
     environment:
       epochs: 10
       train_file: /dataset/train_minimal.jsonl
-      dev_file: /dataset/valid_minimal.jsonl
+      valid_file: /dataset/valid_minimal.jsonl
       test_file: /dataset/test_minimal.jsonl
diff --git a/docker-compose-pretrained-minimal.yml b/docker-compose-pretrained-minimal.yml
@@ -15,7 +15,8 @@ services:
-      do_train: "false"
-      do_val: "true"
-      do_test: "true"
-      dev_file: /dataset/valid_minimal.jsonl
+      # Flag names must match the upper-case variables read by entrypoint.sh (DO_TRAIN / DO_VALID / DO_TEST).
+      DO_TRAIN: "false"
+      DO_VALID: "true"
+      DO_TEST: "true"
+      valid_file: /dataset/valid_minimal.jsonl
       test_file: /dataset/test_minimal.jsonl
       no_cuda: "true"
       pretrained_model: microsoft/codebert-base
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -13,6 +13,6 @@ services:
     environment:
       epochs: 5
       train_file: /dataset/train.jsonl
-      dev_file: /dataset/valid.jsonl
+      valid_file: /dataset/valid.jsonl
       test_file: /dataset/test.jsonl
       batch_size: 8
diff --git a/entrypoint.sh b/entrypoint.sh
@@ -4,10 +4,6 @@
 
 # This file invokes the original python code of the codebert text with the environment variables set in the docker container. 
 # Additionally, it does a switch-case which flags for training, validation and testing have been set 
-# And it uses an anaconda environment to provide the dependencies. 
-
-# Without Anacondas --no-capture-output flag the system prints from the run.py would be hidden until the anaconda process exits. This flag is optional but highly helpful. 
-# Anacondas "-n" parameter specifies which conda-env is used to run the script. It must match the name provided in 'environment.yml'.
 
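-# The use of exit without a number returns the exit code of the fore-going statement - that is in this case the anaconda command. 
-# The Exit codes are necessary, as otherwise all cases are run (atleast, all cases with flags set). 
+# Using exit without a number returns the exit code of the preceding statement - in this case the python command.
+# The exits are necessary, as otherwise all later cases would run as well (at least, all cases whose flags are set).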
@@ -21,12 +17,12 @@
 if [ "$load_existing_model" = true ]; then 
     echo "Found flag to load a model under $load_model_path"
 
-    if [ "$do_train" = true -a "$do_test" = true -a "$do_val" = true ]; then
+    if [ "$DO_TRAIN" = true -a "$DO_TEST" = true -a "$DO_VALID" = true ]; then
         echo "performing full run with training, validation and test"
         python ./run.py \
             --do_train --do_test --do_eval \
             --model_type roberta --model_name_or_path $pretrained_model \
-            --train_filename $train_file --test_filename $test_file --dev_filename $dev_file \
+            --train_filename $train_file --test_filename $test_file --dev_filename $valid_file \
             --output_dir $output_dir \
             --max_source_length $source_length \
             --max_target_length $target_length \
@@ -37,12 +33,12 @@ if [ "$load_existing_model" = true ]; then
             --load_model_path $load_model_path
         exit
     fi
-    if [ "$do_train" = true -a "$do_val" = true ]; then
+    if [ "$DO_TRAIN" = true -a "$DO_VALID" = true ]; then
         echo "performing run with training and validation"
         python ./run.py \
             --do_train --do_eval \
             --model_type roberta --model_name_or_path $pretrained_model \
-            --train_filename $train_file --dev_filename $dev_file \
+            --train_filename $train_file --dev_filename $valid_file \
             --output_dir $output_dir \
             --max_source_length $source_length \
             --max_target_length $target_length \
@@ -53,7 +49,7 @@ if [ "$load_existing_model" = true ]; then
             --num_train_epochs $epochs
         exit
     fi
-    if [ "$do_train" = true -a "$do_test" = true ]; then
+    if [ "$DO_TRAIN" = true -a "$DO_TEST" = true ]; then
         echo "performing run with training and test"
         python ./run.py \
             --do_train --do_test \
@@ -69,7 +65,7 @@ if [ "$load_existing_model" = true ]; then
             --load_model_path $load_model_path
         exit
     fi
-    if [ "$do_train" = true ]; then
+    if [ "$DO_TRAIN" = true ]; then
         echo "performing run with (only) training"
         python ./run.py \
             --do_train \
@@ -86,7 +82,7 @@ if [ "$load_existing_model" = true ]; then
             --load_model_path $load_model_path
         exit 0
     fi
-    if [ "$do_test" = true ]; then
+    if [ "$DO_TEST" = true ]; then
         echo "performing run with (only) testing"
         python ./run.py \
             --do_test \
@@ -106,12 +102,12 @@ fi
 #        Case 2: No Pretrained Model 
 # ============================================
 
-if [ "$do_train" = true -a "$do_test" = true -a "$do_val" = true ]; then
+if [ "$DO_TRAIN" = true -a "$DO_TEST" = true -a "$DO_VALID" = true ]; then
     echo "performing full run with training, validation and test"
     python ./run.py \
         --do_train --do_test --do_eval \
         --model_type roberta --model_name_or_path $pretrained_model \
-        --train_filename $train_file --test_filename $test_file --dev_filename $dev_file \
+        --train_filename $train_file --test_filename $test_file --dev_filename $valid_file \
         --output_dir $output_dir \
         --max_source_length $source_length \
         --max_target_length $target_length \
@@ -121,12 +117,12 @@ if [ "$do_train" = true -a "$do_test" = true -a "$do_val" = true ]; then
         --num_train_epochs $epochs
     exit
 fi
-if [ "$do_train" = true -a "$do_val" = true ]; then
+if [ "$DO_TRAIN" = true -a "$DO_VALID" = true ]; then
     echo "performing run with training and validation"
     python ./run.py \
         --do_train --do_eval \
         --model_type roberta --model_name_or_path $pretrained_model \
-        --train_filename $train_file --dev_filename $dev_file \
+        --train_filename $train_file --dev_filename $valid_file \
         --output_dir $output_dir \
         --max_source_length $source_length \
         --max_target_length $target_length \
@@ -137,7 +133,7 @@ if [ "$do_train" = true -a "$do_val" = true ]; then
     exit
 fi
 
-if [ "$do_train" = true -a "$do_test" = true ]; then
+if [ "$DO_TRAIN" = true -a "$DO_TEST" = true ]; then
     echo "performing run with training and test"
     python ./run.py \
         --do_train --do_test \
@@ -152,7 +148,7 @@ if [ "$do_train" = true -a "$do_test" = true ]; then
         --num_train_epochs $epochs
     exit
 fi
-if [ "$do_train" = true ]; then
+if [ "$DO_TRAIN" = true ]; then
     echo "performing run with (only) training"
     python ./run.py \
         --do_train \
@@ -168,7 +164,7 @@ if [ "$do_train" = true ]; then
         --num_train_epochs $epochs
     exit 0
 fi
-if [ "$do_test" = true ]; then
+if [ "$DO_TEST" = true ]; then
     echo "performing run with (only) testing"
     python ./run.py \
         --do_test \
@@ -182,7 +178,9 @@ if [ "$do_test" = true ]; then
     exit
 fi
 
-# Case 3: Error / Unknown 
+# ===================================
+#     Case 3: Error / Unknown 
+# ===================================
 
 echo "no flags set - please inspect your compose"
 exit 1