Skip to content

Commit 598682a

Browse files
committed
Changed all generated annotations to line annotations
1 parent b7f2937 commit 598682a

1 file changed

Lines changed: 64 additions & 53 deletions

File tree

  • SecurityKeywordsBasedSearchTool/SecFeatFinder

SecurityKeywordsBasedSearchTool/SecFeatFinder/main.py

Lines changed: 64 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def flatten_keywords(keyword_dict):
2525
return flattened
2626

2727

28-
def process_feature_annotations(features_file, repo_dir, flattened_keywords, taxonomy, fm):
28+
def add_api_feature_annotations(features_file, repo_dir, flattened_keywords, taxonomy, fm):
2929
if os.path.exists(features_file):
3030
with open(features_file, "r") as file:
3131
data = json.load(file)
@@ -34,50 +34,66 @@ def process_feature_annotations(features_file, repo_dir, flattened_keywords, tax
3434
sys.exit(1)
3535

3636
library_features = set()
37+
line_annotations = defaultdict(dict)
3738

3839
for source in data.get('sources', []):
39-
for feature in source.get('files', []):
40-
file_path = os.path.join(repo_dir, feature.get('path', ''))
41-
if not os.path.exists(file_path) or not feature.get('apiCalls'):
40+
for file_reference in source.get('files', []):
41+
file_path = os.path.join(repo_dir, file_reference.get('path', ''))
42+
if not os.path.exists(file_path) or not file_reference.get('apiCalls'):
4243
continue
43-
44-
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
45-
lines = f.readlines()
46-
4744
# Collect annotations per line
48-
line_annotations = defaultdict(set)
4945

50-
for api_call in feature['apiCalls']:
46+
for api_call in file_reference['apiCalls']:
5147
line_index = api_call.get('line', 0)
5248
feature_names = api_call.get('features', [])
5349
method_name = api_call.get('api', '').split('.')[-1]
5450

55-
if feature_names and line_index < len(lines):
56-
for feature_name in feature_names:
57-
tag = f"APIMatch|{feature_name}|{method_name}"
58-
line_annotations[line_index].add(tag)
59-
library_features.add(tag)
60-
if add_to_fm(fm, taxonomy, feature_name, tag) is None:
61-
print(f"Feature '{feature_name}' not found in taxonomy, skipped for now.")
62-
63-
# Apply annotations to lines
64-
for line_index, tags in line_annotations.items():
65-
annotation = ""
66-
if len(tags) == 1:
67-
tag = next(iter(tags))
68-
annotation = f"// &line[{tag}]"
69-
else:
70-
tags_str = ", ".join(sorted(tags))
71-
annotation = f"// &line[{tags_str}]"
72-
73-
if annotation not in lines[line_index]:
74-
lines[line_index] = lines[line_index].rstrip() + f" {annotation}\n"
75-
76-
with open(file_path, "w", encoding="utf-8") as f:
77-
f.writelines(lines)
51+
for feature_name in feature_names:
52+
tag = f"APIMatch|{feature_name}|{method_name}"
53+
54+
line_dict = line_annotations.get(file_path)
55+
if line_dict is None:
56+
line_dict = defaultdict(set)
57+
line_annotations[file_path] = line_dict
58+
59+
feature_list = line_dict.get(line_index)
60+
if feature_list is None:
61+
feature_list = set()
62+
line_dict[line_index] = feature_list
63+
64+
feature_list.add(tag)
65+
library_features.add(tag)
66+
if add_to_fm(fm, taxonomy, feature_name, tag) is None:
67+
print(f"Feature '{feature_name}' not found in taxonomy, skipped for now.")
68+
69+
for file_path, values in line_annotations.items():
70+
with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
71+
lines = file.readlines()
72+
73+
# Apply annotations to lines
74+
for line_index, tags in values.items():
75+
if line_index >= len(lines):
76+
print(f"Warning: Line index {line_index} exceeds the number of lines in {file_path}. Skipping adding annotation for: {tags}.")
77+
continue
78+
new_line = createLineAnnotation(lines[line_index].rstrip(), tags)
79+
lines[line_index] = new_line
80+
81+
write(file_path, lines)
7882
return library_features
7983

8084

85+
def createLineAnnotation(line, tags):
86+
annotation = ""
87+
if len(tags) == 1:
88+
tag = next(iter(tags))
89+
annotation = f"// &line[{tag}]"
90+
else:
91+
tags_str = ", ".join(sorted(tags))
92+
annotation = f"// &line[{tags_str}]"
93+
94+
return line.rstrip() + f" {annotation}\n"
95+
96+
8197
def get_subtree(flattened_keywords, feature_name):
8298
"""Search in the flattened keywords for the given feature name and return the result."""
8399
for category, subcategory, keyword in flattened_keywords:
@@ -97,7 +113,6 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
97113
short_path = os.path.relpath(file_path, repo_dir)
98114
if "src\\" in short_path:
99115
short_path = short_path[short_path.index("src\\"):]
100-
updated_lines = [] # Store updated lines with added comments
101116
hans_lines_seen = set() # Store unique Hans line patterns
102117

103118
in_multiline_comment = False
@@ -114,12 +129,13 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
114129
multi_line_comment_end_pattern = re.compile(r"\*/") # Match */ (end of multi-line comment)
115130

116131
with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
117-
for line_number, line in enumerate(file, start=1):
132+
lines = file.readlines()
133+
line_number = 0
134+
for line in lines:
118135
stripped_line = line.strip()
119136

120137
# Skip import statements
121138
if "import" in stripped_line:
122-
updated_lines.append(line)
123139
continue
124140

125141
# Skip lines that are individually annotated with HAnS
@@ -129,14 +145,12 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
129145
if match not in hans_lines_seen: # Count only if not seen before
130146
hans_exclusion_counter[0] += 1
131147
hans_lines_seen.add(match)
132-
updated_lines.append(line)
133148
continue
134149

135150
# Handle multi-line comments
136151
if multi_line_comment_start_pattern.search(stripped_line):
137152
in_multiline_comment = True
138153
if in_multiline_comment:
139-
updated_lines.append(line)
140154
if multi_line_comment_end_pattern.search(stripped_line):
141155
in_multiline_comment = False
142156
continue
@@ -153,17 +167,15 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
153167
in_hans_annotated_block = True
154168
if hans_end_pattern.search(stripped_line):
155169
in_hans_annotated_block = False
156-
updated_lines.append(line) # Preserve the closing annotation
170+
# Preserve the closing annotation
157171
continue
158172

159173
# Skip lines inside test contexts or inside a HAnS-annotated block
160174
if in_testing_context or in_hans_annotated_block:
161-
updated_lines.append(line)
162175
continue
163176

164177
# Skip single-line comments
165178
if single_line_comment_pattern.search(stripped_line):
166-
updated_lines.append(line)
167179
continue
168180

169181
# Remove all string literals from the line before searching for keywords
@@ -197,15 +209,10 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
197209
# Add begin and end comments only once for the line
198210
features, fm = determine_feature(pos_counter, matches, line_number, fm)
199211

200-
comment_start = "// &begin["+features+"]\n"
201-
updated_lines.append(comment_start)
202-
updated_lines.append(line) # Add the line with the match
203-
comment_end = "// &end["+features+"]\n"
204-
updated_lines.append(comment_end)
212+
lines[line_number] = createLineAnnotation(line, features)
205213
pos_list.append(f"Pos{pos_counter[0]}")
206214

207-
else:
208-
updated_lines.append(line)
215+
line_number += 1
209216

210217
# Consolidate the "Keywords Found" for each line
211218
for match in matches.values():
@@ -216,14 +223,18 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
216223
match["Keywords Found"] = ", ".join(consolidated_keywords)
217224

218225
# Save updated file with comments
219-
with open(file_path, "w", encoding="utf-8") as file:
220-
file.writelines(updated_lines)
226+
if len(matches.values()) > 0:
227+
write(file_path, lines)
221228

222229
return os.path.basename(file_path), short_path, list(matches.values())
223230

231+
def write(file_path, lines):
232+
with open(file_path, "w", encoding="utf-8") as file:
233+
file.writelines(lines)
234+
224235

225236
def determine_feature(pos_counter, matches, line_number, fm):
226-
features = ''
237+
features = []
227238
for match in list(matches[line_number]["Keywords Found"].items()):
228239
if len(features) > 0:
229240
features += ', '
@@ -240,7 +251,7 @@ def determine_feature(pos_counter, matches, line_number, fm):
240251
)
241252

242253
feature_name = f'KeywordMatch|{path[length - 1]}|{value}'
243-
features += feature_name
254+
features.append(feature_name)
244255

245256
current = fm
246257
i = 0
@@ -341,7 +352,7 @@ def main():
341352
fm = Feature(taxonomy.name, None)
342353

343354
# Process library annotations first
344-
library_features = process_feature_annotations(project_dir+"/result/features.json", project_dir, flattened_keywords, taxonomy, fm)
355+
library_features = add_api_feature_annotations(project_dir+"/result/features.json", project_dir, flattened_keywords, taxonomy, fm)
345356

346357
# Initialize the exclusion counter ONCE here
347358
hans_exclusion_counter = [0]

0 commit comments

Comments (0)