# twinspect/config.yml (iscc/twinspect, main branch)
# Version marker for this configuration file.
# NOTE(review): presumably the twinspect release/schema version this config targets — confirm.
twinspect: 0.1.0
# Algorithms available for benchmarking.
#
# Fields per entry:
#   name         - human-readable title
#   label        - short identifier; the `benchmarks` section and `ensemble_of`
#                  refer to algorithms by this value
#   mode         - media type handled (text, image, audio or video)
#   function     - implementation reference written as `module.path:attribute`
#                  (presumably resolved as a Python import path — confirm)
#   info         - descriptive text
#   url          - link to the implementation or project
#   dependencies - package requirements (may be an empty list)
#   ensemble_of  - only on combined codes: labels of the member algorithms
algorithms:
  # ---- ISCC content codes (iscc-sdk) ----
  - name: ISCC Text-Code V0 64-Bit
    label: text_code_v0_64
    mode: text
    function: twinspect.algos.iscc:text_code_v0_64
    info: >-
      The ISCC Text-Code is a similarity preserving hash designed to cluster
      and match near-duplicate text documents that have undergone format
      conversion or minor edits.
    url: https://github.com/iscc/iscc-core/blob/main/iscc_core/code_content_text.py
    dependencies: ['iscc-sdk>=0.8.5']
  - name: ISCC Image-Code V0 64-Bit
    label: image_code_v0_64
    mode: image
    function: twinspect.algos.iscc:image_code_v0_64
    info: >-
      The ISCC Image-Code is a similarity preserving perceptual hash designed
      to cluster and match near-duplicate images that have undergone format
      conversion or minor edits.
    url: https://github.com/iscc/iscc-core/blob/main/iscc_core/code_content_image.py
    dependencies: ['iscc-sdk>=0.8.5']
  - name: ISCC Audio-Code V0 64-Bit
    label: audio_code_v0_64
    mode: audio
    function: twinspect.algos.iscc:audio_code_v0_64
    info: >-
      The ISCC Audio-Code is a similarity preserving hash based on chromaprint
      and designed to cluster and match near-duplicate audio files that have
      undergone format conversion, transcoding, compression and other minor
      edits.
    url: https://github.com/iscc/iscc-core/blob/main/iscc_core/code_content_audio.py
    dependencies: ['iscc-sdk>=0.8.5']
  - name: ISCC Video-Code V0 64-Bit
    label: video_code_v0_64
    mode: video
    function: twinspect.algos.iscc:video_code_v0_64
    info: >-
      The ISCC Video-Code is a similarity preserving hash based on the MPEG-7
      Video Signature and is designed to cluster and match near-duplicate
      videos that have undergone format conversion or minor edits.
    url: https://github.com/iscc/iscc-core/blob/main/iscc_core/code_content_video.py
    dependencies: ['iscc-sdk>=0.8.5']

  # ---- Semantic image code and its 128-bit combination ----
  - name: ISCC Image-Code-S 64-Bit
    label: image_code_s_64
    mode: image
    function: twinspect.algos.iscc:image_code_s_64
    info: >-
      The ISCC Image-Code-S is a semantic image similarity hash based on a deep
      neural network (DINOv2). It is designed to cluster and match semantically
      similar images independent of visual transformations.
    url: https://github.com/iscc/iscc-sci
    dependencies: [iscc-sci, onnxruntime-gpu]
  # Combined code: `function` is a placeholder; the members are named in
  # `ensemble_of` and are both defined earlier in this list.
  - name: ISCC Image-Code-SC 128-Bit
    label: image_code_sc_128
    mode: image
    function: twinspect.algos.ensemble:placeholder
    info: >-
      Combined 128-bit image code concatenating the 64-bit semantic code
      (iscc-sci/DINOv2) with the 64-bit perceptual code (iscc-sdk). Captures
      both semantic similarity and visual characteristics.
    url: https://github.com/iscc/twinspect
    dependencies: []
    ensemble_of: [image_code_s_64, image_code_v0_64]

  # ---- Semantic text code and its 128-bit combination ----
  - name: ISCC Text-Code-S 64-Bit
    label: text_code_s_64
    mode: text
    function: twinspect.algos.iscc:text_code_s_64
    info: >-
      The ISCC Text-Code-S is a semantic text similarity hash based on a deep
      neural network (DeBERTa). It is designed to cluster and match
      semantically similar text independent of textual transformations,
      supporting cross-lingual similarity matching.
    url: https://github.com/iscc/iscc-sct
    dependencies: [iscc-sct, onnxruntime-gpu]
  # Combined code: `function` is a placeholder; the members are named in
  # `ensemble_of` and are both defined earlier in this list.
  - name: ISCC Text-Code-SC 128-Bit
    label: text_code_sc_128
    mode: text
    function: twinspect.algos.ensemble:placeholder
    info: >-
      Combined 128-bit text code concatenating the 64-bit semantic code
      (iscc-sct/DeBERTa) with the 64-bit perceptual code (iscc-sdk). Captures
      both semantic similarity and textual characteristics.
    url: https://github.com/iscc/twinspect
    dependencies: []
    ensemble_of: [text_code_s_64, text_code_v0_64]
# Datasets the benchmarks run against.
#
# Fields per entry:
#   name      - human-readable title
#   label     - short identifier; `benchmarks[].dataset_label` refers to it
#   info      - descriptive text (contains Markdown links)
#   url       - source location of the data or its ground truth
#   mode      - media type of the dataset (image, audio or text)
#   installer - installer reference written as `module.path:attribute`
#               (presumably resolved as a Python import path — confirm)
# Any further keys are dataset-specific settings.
# NOTE(review): presumably these are passed to the installer — confirm.
datasets:
  - name: MIRFLICKR-MFND
    label: mirflickr_mfnd
    info: The MFND benchmark ([Connor et al.,
      2015](http://dx.doi.org/10.5220/0005359705650571)) is a subset of the real-world
      MIRFLICKR dataset ([Huiskes & Lew, 2008](https://doi.org/10.1145/1460096.1460104)) with
      [annotations](http://www.mir-flickr-near-duplicates.appspot.com/) for near duplicates
      (IND). The Twinspect benchmark automatically downloads and reproduces the tested dataset.
    url: https://mfnd.similarity.eu/data/truthfiles/polito/IND_clusters.txt
    mode: image
    installer: twinspect.datasets.mfnd:install
  - name: ISCC-FMA-10k
    label: iscc_fma_10k
    info: The ISCC-FMA-10k benchmark is a subset of the [Free Music Archive
      Dataset](https://doi.org/10.48550/arXiv.1612.01840). The subset is generated by
      collecting 5000 random audio files (longer than 60 seconds). Additionally, 10 synthetic
      transformations are applied to a random selection of 500 of the audio files. The Twinspect
      benchmark automatically downloads and reproduces the tested dataset.
    url: https://os.unil.cloud.switch.ch/fma/fma_full.zip
    mode: audio
    installer: twinspect.datasets.fma:install
    # These values mirror the numbers quoted in `info`: 5000 collected files
    # and 500 files selected for transformation.
    samples: 5000
    clusters: 500
    # NOTE(review): presumably the random seed that makes the selection reproducible — confirm.
    seed: 0
  - name: NewsEdits-Reuters-1000
    label: newsedits_reuters_1000
    info: 1000-cluster subset of the [NewsEdits](https://github.com/isi-nlp/NewsEdits) Reuters
      dataset containing news article revisions (2012-2020). Filtered using LCS similarity
      (>=70%) and length variation (<=20%) to ensure genuine near-duplicates. Cross-cluster
      deduplication prevents duplicate articles.
    url: https://github.com/isi-nlp/NewsEdits
    mode: text
    installer: twinspect.datasets.newsedits:install
    # Filter thresholds; the 0.20 and 0.70 values match the 20% / 70% limits
    # stated in `info`.
    clusters: 1000
    min_content_length: 1000
    max_length_variation: 0.20
    min_text_overlap: 0.70
# Synthetic transformations. Every entry in this file has `mode: audio`.
#
# Fields per entry:
#   name     - human-readable title
#   label    - short identifier for the transformation
#   info     - description of what the transformation does
#   mode     - media type the transformation applies to
#   function - implementation reference written as `module.path:attribute`
#              (presumably resolved as a Python import path — confirm)
#   params   - optional list of arguments
#              (presumably passed positionally to `function` — confirm)
transformations:
  - name: Trim 1 Second
    label: trim-1s-both
    info: Remove 1 second of audio from start and end
    mode: audio
    function: twinspect.transformations.audio:trim
    params:
      - 1
      - both
  - name: Trim 5 Seconds
    label: trim-5s-both
    info: Remove 5 seconds of audio from start and end
    mode: audio
    function: twinspect.transformations.audio:trim
    params:
      - 5
      - both
  - name: Fade 8 Seconds
    label: fade-8s-both
    info: Fade in/out 8 seconds at start and end
    mode: audio
    function: twinspect.transformations.audio:fade
    params:
      - 8
      - both
  - name: Transcode MP3 128kbps
    label: transcode-mp3-128kbps
    info: Transcode audio to 128kbps MP3
    mode: audio
    function: twinspect.transformations.audio:transcode
    params:
      - mp3
      - 128
  - name: Transcode OGG 64kbps
    label: transcode-ogg-64kbps
    info: Transcode audio to 64kbps OGG
    mode: audio
    function: twinspect.transformations.audio:transcode
    params:
      - ogg
      - 64
  - name: Transcode AAC 32kbps
    label: transcode-aac-32kbps
    info: Transcode audio to 32kbps AAC
    mode: audio
    function: twinspect.transformations.audio:transcode
    params:
      - aac
      - 32
  - name: Compress
    label: compress-medium
    info: Apply audio compression (attack 10, release 200, ratio 3, threshold -20)
    mode: audio
    function: twinspect.transformations.audio:compress
    params:
      - medium
  # The remaining entries define no `params`.
  - name: Equalize
    label: equalize
    info: Equalize audio (ffmpeg equalizer=f=1000:t=o:w=200:g=10)
    mode: audio
    function: twinspect.transformations.audio:equalize
  - name: Echo
    label: echo
    info: Apply echo effect (ffmpeg aecho=0.8:0.7:60:0.2)
    mode: audio
    function: twinspect.transformations.audio:echo
  - name: Loudness Normalization
    label: loudnorm
    info: Apply loudness normalization (ffmpeg loudnorm=I=-16:TP=-1.5:LRA=11)
    mode: audio
    function: twinspect.transformations.audio:loudnorm
# Metrics that a benchmark can request.
#
# Fields per entry:
#   name     - human-readable title
#   label    - short identifier; `benchmarks[].metric_labels` refers to it
#   function - implementation reference written as `module.path:attribute`
#              (presumably resolved as a Python import path — confirm)
metrics:
  - name: Execution Speed
    label: speed
    function: twinspect.metrics.speed:speed
  - name: Effectiveness (Precision, Recall, F1-Score)
    label: effectiveness
    function: twinspect.metrics.eff:effectiveness
  - name: Robustness against transformations
    label: robustness
    function: twinspect.metrics.robustness:robustness
  - name: Distribution (All-Pairs Hamming Distances)
    label: distribution
    function: twinspect.metrics.distribution:distribution
# Benchmark matrix. Each entry pairs one algorithm with one dataset and lists
# the metrics to evaluate, all referenced by the `label` values defined in the
# `algorithms`, `datasets` and `metrics` sections of this file.
# NOTE(review): `active` presumably toggles whether the benchmark is run — confirm.
benchmarks:
  # ---- Image codes on MIRFLICKR-MFND ----
  - algorithm_label: image_code_v0_64
    dataset_label: mirflickr_mfnd
    metric_labels: [speed, effectiveness, distribution]
    active: true
  - algorithm_label: image_code_s_64
    dataset_label: mirflickr_mfnd
    metric_labels: [speed, effectiveness, distribution]
    active: true
  - algorithm_label: image_code_sc_128
    dataset_label: mirflickr_mfnd
    metric_labels: [speed, effectiveness, distribution]
    active: true

  # ---- Audio code on ISCC-FMA-10k ----
  # The only benchmark that requests the `robustness` metric.
  - algorithm_label: audio_code_v0_64
    dataset_label: iscc_fma_10k
    metric_labels: [speed, effectiveness, robustness, distribution]
    active: true

  # ---- Text codes on NewsEdits-Reuters-1000 ----
  - algorithm_label: text_code_v0_64
    dataset_label: newsedits_reuters_1000
    metric_labels: [speed, effectiveness, distribution]
    active: true
  - algorithm_label: text_code_s_64
    dataset_label: newsedits_reuters_1000
    metric_labels: [speed, effectiveness, distribution]
    active: true
  - algorithm_label: text_code_sc_128
    dataset_label: newsedits_reuters_1000
    metric_labels: [speed, effectiveness, distribution]
    active: true