-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_post_processing_ccc.sh
More file actions
executable file
·77 lines (63 loc) · 2.58 KB
/
run_post_processing_ccc.sh
File metadata and controls
executable file
·77 lines (63 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
################################################################################
# Run all relevant post processing scripts for "Jupyter Notebooks on GitHub:
# Characteristics and Code Clones", that is scripts for processing output of
# NotebookAnalyzer run with the flag "--ccc", and output of SccOutputAnalyzer.
################################################################################
# Load path configuration. outputNBA and outputSOA are not defined in this
# script, so they are presumably set by paths.sh; abort early if they are not,
# since an empty value would make the 'ln' calls below create a bogus link.
source paths.sh
: "${outputNBA:?outputNBA is not set (expected from paths.sh)}"
: "${outputSOA:?outputSOA is not set (expected from paths.sh)}"

# PREPARATIONS:
# If a notebook is written in Python and is not included in the clone frequency
# file in outputSOA, it is empty. Find these and add to cloneFrequency and
# connections files in outputSOA (since empty notebooks are not included among
# SourcererCC clones).

# Step 1: list the Python notebooks found in the languages file of outputNBA.
# -f/-n: replace an Output link left behind by an aborted run instead of
# silently creating the new link inside the directory the old one points to.
ln -sfn -- "$outputNBA" Output || exit 1
# Every later relative path assumes we are inside Scripts, so stop if we are not.
cd Scripts || exit 1
pythonNotebooks="../pythonNotebooks.txt"
# get_last_output.sh is a project helper; it is expected to print a single
# file path (TODO confirm against the helper).
langFile=$(./get_last_output.sh "languages")
if [[ ! -f "$langFile" ]]; then
  echo "Languages file not found: '$langFile'" >&2
  exit 1
fi
# Keep only the notebook name (first comma-separated field) of PYTHON rows.
grep -E '^nb_[0-9]+\.ipynb, PYTHON, ' -- "$langFile" \
  | cut -d',' -f1 > "$pythonNotebooks"
cd .. || exit 1

# Step 2: point Output at the SourcererCC results and add the missing notebooks.
ln -sfn -- "$outputSOA" Output || exit 1
cd Scripts || exit 1
sccNotebooks="sccNotebooks.txt"
cloneFreqSCC=$(./get_last_output.sh "cloneFrequency")
connectionsSCC=$(./get_last_output.sh "connections")
for resultFile in "$cloneFreqSCC" "$connectionsSCC"; do
  if [[ ! -f "$resultFile" ]]; then
    echo "SourcererCC result file not found: '$resultFile'" >&2
    exit 1
  fi
done
# Notebook names already present in the clone frequency file; the first line
# is skipped (presumably a header line).
sed -n '2,$ p' -- "$cloneFreqSCC" | cut -d',' -f1 > "$sccNotebooks"
# Every Python notebook that is absent from the clone frequency file gets an
# all-zero row appended to both result files.
grep -vFf "$sccNotebooks" -- "$pythonNotebooks" | while read -r notebook; do
  printf '%s, 0, 0, 0, 0, 0, 0, 0\n' "$notebook" >> "$cloneFreqSCC"
  printf '%s, 0, 0.0000, 0, 0.0000\n' "$notebook" >> "$connectionsSCC"
done
# Remove the two temporary notebook lists.
rm -- "$pythonNotebooks" "$sccNotebooks"
# NEAR-MISS-CLONES:
# This section runs from inside Scripts, with Output linked to the SourcererCC
# results.
# Postprocessing for SourcererCC result
./clone_analysis_scc.sh > ../Output/output_clone_analysis.txt
# Statistics for SccOutputAnalyzer results
./create_sym_links_scc.sh
Rscript statistics_ccc_scc.R > ../Output/output_statistics.txt
# Pack plots
./reduce_large_images.sh
# Pack inside a subshell so a failed 'cd' cannot make tar run in Scripts and
# so the working directory of the main script is left unchanged.
(
  cd ../Output || exit 1
  # Drop patterns without matches instead of handing them to tar literally.
  shopt -s nullglob
  plots=(hist_clone_frequency*eps log_hist_*eps cells*jpg *Inter_*jpg)
  if (( ${#plots[@]} > 0 )); then
    tar -czf plots.tgz -- "${plots[@]}"
  else
    echo "No plots found to pack in $PWD" >&2
  fi
) || echo "Packing of SourcererCC plots failed" >&2
# Back to the top-level directory to remove the Output link.
cd .. || exit 1
rm -- Output
# CMW CLONES AND STATISTICS FOR THE CORPUS:
# Postprocessing of NotebookAnalyzer result
# -f/-n: replace an Output link left behind by an aborted run instead of
# silently creating the new link inside the directory the old one points to.
# outputNBA is not defined in this script (presumably it comes from paths.sh);
# abort if it is empty rather than creating a bogus link.
ln -sfn -- "${outputNBA:?outputNBA is not set (expected from paths.sh)}" Output \
  || exit 1
# Every relative path below assumes we are inside Scripts.
cd Scripts || exit 1
./language_analysis.sh > ../Output/output_language_analysis.txt
./language_inconsistencies.sh > ../Output/output_language_inconsistencies.txt
./clone_analysis_nba.sh > ../Output/output_clone_analysis.txt
./print_most_common_snippets.sh 100 0 1 > ../Output/top100clones_min0.txt
./print_most_common_snippets.sh 100 4 1 > ../Output/top100clones_min4loc.txt
./list_duplicated_notebooks.sh # takes 2 days on fxpl-stat
# Statistics for NotebookAnalyzer results
./create_sym_links_nba.sh
./get_notebook_sizes.sh
Rscript statistics_ccc_nba.R > ../Output/output_statistics.txt
# Pack plots
./reduce_large_images.sh
# Pack inside a subshell so a failed 'cd' cannot make tar run in Scripts and
# so the working directory of the main script is left unchanged.
(
  cd ../Output || exit 1
  # Drop patterns without matches instead of handing them to tar literally.
  shopt -s nullglob
  plots=(hist_clone_frequency*eps lang*eps log_hist_*eps cells*jpg *Inter_*jpg)
  if (( ${#plots[@]} > 0 )); then
    tar -czf plots.tgz -- "${plots[@]}"
  else
    echo "No plots found to pack in $PWD" >&2
  fi
) || echo "Packing of NotebookAnalyzer plots failed" >&2
# Back to the top-level directory to remove the Output link.
cd .. || exit 1
rm -- Output