From ba5f07c65c07cf747f3e62b5f0d43df3b773a0d1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Mon, 25 Aug 2025 13:35:24 +0200 Subject: [PATCH 01/32] Add commit author of 'commit_added' events to event info This allows for reconstruction of correct commit author if user is github Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 8 ++++++-- issue_processing/issue_processing.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 2b54ef7..cea6efd 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -181,7 +181,7 @@ def is_github_noreply_author(name, email): commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] for event in issue_data: @@ -189,12 +189,16 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. 
As we diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 3db14d5..73d48e3 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -364,6 +364,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -750,6 +751,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -763,6 +767,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) From c40df30e919b3877aea11058c2fa917e34443057 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 26 Aug 2025 10:56:56 +0200 Subject: [PATCH 02/32] Update Copyright headers also added one comment for clarity Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 2 ++ issue_processing/issue_processing.py | 1 + 2 files changed, 3 insertions(+) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index cea6efd..a7a5488 
100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025 by Leo Sendelbach # Copyright 2026 by Thomas Bock # Copyright 2025 by Maximilian Löffler # All Rights Reserved. @@ -189,6 +190,7 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] + # extract author name from event info 2 while cutting excess '"' name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 73d48e3..53c8313 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Leo Sendelbach # Copyright 2026 by Thomas Bock # Copyright 2025 by Maximilian Löffler # All Rights Reserved. From eb1849b75cea1f4e17746f4f18100f1f393c4eef Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Thu, 25 Sep 2025 14:37:37 +0200 Subject: [PATCH 03/32] Add connected events reconstruction also save merge commits reconstruction of connected events is done by first saving all connected events that occurred at the same time.
Then, it is possible to match connected events iff: - half of the involved issues are equal, meaning that one issue is connected to multiple others - half rounded up of the involved issues are equal, meaning that we have one external connected event and then the previous case with the remaining issues Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 53c8313..a09b765 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -31,6 +31,7 @@ import os import sys from datetime import datetime, timedelta +import math from logging import getLogger from codeface_utils.cluster.idManager import dbIdManager, csvIdManager @@ -56,6 +57,9 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" +filtered_connected_events = dict() +external_connected_events = dict() + def run(): # get all needed paths and arguments for the method call.
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -297,6 +301,7 @@ def merge_issue_events(issue_data): log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -493,6 +498,28 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = issue["user"] + connected_events[event["created_at"]] = connected_info + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + 
issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -504,6 +531,10 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + global filtered_connected_events + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1]), connected_events.iteritems())) + # updates all the issues by the temporarily stored referenced_by events for _, value in issue_data_to_update.items(): for issue in issue_data: @@ -513,6 +544,41 @@ def merge_issue_events(issue_data): return issue_data +def filter_connected_events(key, value): + num_issues = len(value["issues"]) + global external_connected_events + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 and math.ceil(num_issues/2) in occurances.values(): + for sub_key, sub_value in occurances.iteritems(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == math.ceil(num_issues/2): + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + 
external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # no other variants can be easily matched + return False + + def reformat_events(issue_data): """ Re-format event information dependent on the event type. @@ -543,6 +609,37 @@ def reformat_events(issue_data): if event["ref_target"] is not None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + external = False + # check if event is external + for key, value in external_connected_events.iteritems(): + if issue["number"] in value["issues"]: + if key == event["created_at"]: + external = True + event["event_info_1"] = "external" + value["issues"].remove(issue["number"]) + # if so, skip the next checks + if external: + continue + # otherwise, it must be internal + for key, value in filtered_connected_events.iteritems(): + if issue["number"] in value["issues"]: + if key == event["created_at"]: + if len(value["issues"]) == 2: + # if only 2 events occured at this timestamp, matching the issues is trivial + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + # otherwise, if current issue is the centerpiece of all connected events, use previous copy to match issues + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # if current issue is not the centerpiece, connect it to the centerpiece + event["event_info_1"] = max(occurances, key = occurances.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in 
issue_data: From dd3f1516a35adc680195336fe5ca3190662b250d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 14:43:33 +0200 Subject: [PATCH 04/32] Remove unnecessary returns of issue data since data is modified in-place, return of input data is not needed Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a09b765..64e4253 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -82,13 +82,13 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + merge_issue_events(issues) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -287,7 +287,7 @@ def reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return def merge_issue_events(issue_data): @@ -541,7 +541,7 @@ def merge_issue_events(issue_data): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data + return def filter_connected_events(key, value): @@ -750,7 +750,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -889,7 +889,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") 
csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): From 62ebd6dd5d1fe850fedcd94ff398b5d3587f416d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 14:47:19 +0200 Subject: [PATCH 05/32] Add reasons to reopen/closed events Also add commit hash if closed by commit Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 64e4253..0af1306 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -657,13 +657,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": From 51eee0e177bc456e3b98858425054b1223f8a33f Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:01:48 +0200 Subject: [PATCH 06/32] Add GitHub issue types also rename 'new feature' to 'feature' Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 7 +++++-- issue_processing/jira_issue_processing.py | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 0af1306..720ffc2 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -46,7 +46,7 @@
log = getLogger(__name__) # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -246,7 +246,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 4220b96..ee3ae62 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -303,9 +303,12 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): link = issue_x.getElementsByTagName("link")[0] issue["url"] = link.firstChild.data - type = issue_x.getElementsByTagName("type")[0] - issue["type"] = type.firstChild.data - issue["type_list"] = ["issue", str(type.firstChild.data.lower())] + type = issue_x.getElementsByTagName("type")[0].firstChild.data + # rename 'new feature' type to 'feature' to be in line with the github original issue type + if type == "New Feature": + type = "Feature" + issue["type"] = type + issue["type_list"] = ["issue", str(type.lower())] status = issue_x.getElementsByTagName("status")[0] issue["state"] = status.firstChild.data From 1ef9df845f1ba58ed14707911cec85747498282d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:26:55 +0200 Subject: [PATCH 07/32] Simplify loops for reconstruction of connections also remove duplicates from type list Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 67 +++++++++++----------------- 1 
file changed, 26 insertions(+), 41 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 720ffc2..d35ba47 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -57,9 +57,6 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" -filtered_connected_events = dict() -external_connected_events = dict() - def run(): # get all needed paths and arguments for the method call. parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -84,9 +81,10 @@ def run(): # 2) re-format the issues reformat_issues(issues) # 3) merges all issue events into one list - merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) # 6) dump result to disk @@ -293,7 +291,7 @@ def reformat_issues(issue_data): return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. 
@@ -535,8 +533,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # filter out connected events which cannot be perfectly matched - global filtered_connected_events - filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1]), connected_events.iteritems())) + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) # updates all the issues by the temporarily stored referenced_by events for _, value in issue_data_to_update.items(): @@ -544,12 +541,11 @@ def merge_issue_events(issue_data): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return + return filtered_connected_events -def filter_connected_events(key, value): +def filter_connected_events(key, value, external_connected_events): num_issues = len(value["issues"]) - global external_connected_events # if only a single connected event exists at this time, it has to be connecting to an external issue if num_issues == 1: external_connected_events[key] = value @@ -582,7 +578,7 @@ def filter_connected_events(key, value): return False -def reformat_events(issue_data): +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. 
@@ -614,34 +610,23 @@ def reformat_events(issue_data): # reconstruction of connections if event["event"] == "connected": - external = False - # check if event is external - for key, value in external_connected_events.iteritems(): - if issue["number"] in value["issues"]: - if key == event["created_at"]: - external = True - event["event_info_1"] = "external" - value["issues"].remove(issue["number"]) - # if so, skip the next checks - if external: - continue - # otherwise, it must be internal - for key, value in filtered_connected_events.iteritems(): - if issue["number"] in value["issues"]: - if key == event["created_at"]: - if len(value["issues"]) == 2: - # if only 2 events occured at this timestamp, matching the issues is trivial - event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] - else: - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurances[issue["number"]] == max(occurances.values()): - # otherwise, if current issue is the centerpiece of all connected events, use previous copy to match issues - number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) - value["multi_issues_copy"].remove(number) - event["event_info_1"] = number - else: - # if current issue is not the centerpiece, connect it to the centerpiece - event["event_info_1"] = max(occurances, key = occurances.get) + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else 
value["issues"][1] + else: + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + event["event_info_1"] = max(occurances, key = occurances.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -677,7 +662,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList From 5632b9d837f7df3d390ee23d905b768844a5d60e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:28:23 +0200 Subject: [PATCH 08/32] Add subissues to results csv using empty line reserved for jira components Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index d35ba47..3cfe110 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -907,7 +907,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps([issue["subIssues"]]), # components event["event"], event["user"]["name"], event["user"]["email"], From 4690c68e9f79cfab30ef59c972e7c282caeb5113 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 21 Oct 2025 14:10:18 +0200 Subject: [PATCH 09/32] Remove unnecessary return value also added copyright header Signed-off-by: Leo Sendelbach --- issue_processing/jira_issue_processing.py | 5 +++-- 1 file changed, 3
insertions(+), 2 deletions(-) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index ee3ae62..a431ffa 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -19,6 +19,7 @@ # Copyright 2020-2021 by Thomas Bock # Copyright 2026 by Thomas Bock # Copyright 2023, 2025 by Maximilian Löffler +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. @@ -128,7 +129,7 @@ def run(): referenced_issue["history"].append(referenced_by) # 5) update user data with Codeface database - processed_issues = insert_user_data(processed_issues, __conf) + insert_user_data(processed_issues, __conf) # 6) dump result to disk print_to_disk(processed_issues, __resdir) # # 7) export for Gephi @@ -695,7 +696,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): event["event_info_2"] = assigned_user["email"] log.debug("number of issues after insert_user_data: '{}'".format(len(issues))) - return issues + return def print_to_disk(issues, results_folder): From f1e93d3f40416184ff2cfa61c88bc787076406b2 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 21 Oct 2025 14:11:29 +0200 Subject: [PATCH 10/32] Add comments also minor fixes and removal of math.ceil Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 36 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 3cfe110..51f3ce0 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -80,10 +80,15 @@ def run(): issues = load(__srcdir) # 2) re-format the issues reformat_issues(issues) - # 3) merges all issue events into one list + # create an empty dict for external connected events, meaning connected + # events that connect to an issue in another repository external_connected_events = dict() + # 3) merges 
all issue events into one list + # this step returns a dict containing all connected events that can be matched to the correct issues later filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues + # this step also reconstructs the connections previously stored + # in 'external_connected_events' and 'filtered_connected_events' reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) @@ -506,16 +511,20 @@ def merge_issue_events(issue_data, external_connected_events): # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction if event["event"] == "connected": if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list connected_events[event["created_at"]]["issues"].append(issue["number"]) elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) event["created_at"] = 
subtract_seconds_from_time(event["created_at"], -1) else: + # if there is no connected event yet at this timestamp, create a new entry for this event connected_info = dict() connected_info["issues"] = [issue["number"]] connected_info["user"] = issue["user"] @@ -553,19 +562,19 @@ def filter_connected_events(key, value, external_connected_events): # if 2 connected events exist, matching them is trivial if num_issues == 2: return True - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + occurences = {x: value["issues"].count(x) for x in set(value["issues"])} # otherwise, if it is an even number, check if it can be easily matched, # meaning that exactly half the events occur in the same issue - if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + if num_issues % 2 == 0 and num_issues/2 in occurences.values(): # duplicate issue list for matching the issues later value["multi_issues_copy"] = list(value["issues"]) return True # if it is an odd number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue - if num_issues % 2 == 1 and math.ceil(num_issues/2) in occurances.values(): - for sub_key, sub_value in occurances.iteritems(): + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values(): + for sub_key, sub_value in occurences.iteritems(): # then, assign one of them as an external connected event and proceed as in previous case - if sub_value == math.ceil(num_issues/2): + if sub_value == (num_issues + 1)/2: new_entry = dict() new_entry["user"] = value["user"] new_entry["issues"] = [sub_key] @@ -612,21 +621,30 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev if event["event"] == "connected": if event["created_at"] in external_connected_events \ and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list 
event["event_info_1"] = "external" external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) elif event["created_at"] in filtered_connected_events \ and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event value = filtered_connected_events[event["created_at"]] if len(value["issues"]) == 2: + # and we only have 2 issues in the list, connect to the other issue event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] else: - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurances[issue["number"]] == max(occurances.values()): + # and we have more than two issues, count each issue's occurences + occurences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurences[issue["number"]] == max(occurences.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) value["multi_issues_copy"].remove(number) event["event_info_1"] = number else: - event["event_info_1"] = max(occurances, key = occurances.get) + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurences, key=occurences.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: From 8351b311d55d3ab25acc8fb50a8ae19cae155b5f Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 16:28:42 +0100 Subject: [PATCH 11/32] Add new json field for suggestions to result comments now each have a boolean field that describes whether the comment contains a suggestion or not Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 51f3ce0..4e5a901 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -745,7 +745,10 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev # "state_new" and "resolution" of the issue give the information about the state and the resolution of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + if "contains_suggestion" in event: + event["event_info_2"] = event["contains_suggestion"] + else: + event["event_info_2"] = False elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits From fa67649ce4557bd92b1bedae261dfa5249926428 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 16:36:11 +0100 Subject: [PATCH 12/32] Improve documentation dicts for reconstructing connected events are now better explained and the comments do not disrupt the workflow in the run function anymore Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 4e5a901..bc64c3b 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -80,15 +80,10 @@ def run(): issues = load(__srcdir) # 2) re-format the issues reformat_issues(issues) - # create an empty dict for external connected events, meaning connected - # events that connect to an issue in another repository - external_connected_events = dict() # 3) merges all issue events into one list - # this step returns a dict containing all connected events that can be matched to the correct issues later + external_connected_events = dict()
filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - # this step also reconstructs the connections previously stored - # in 'external_connected_events' and 'filtered_connected_events' reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) @@ -542,6 +537,8 @@ def merge_issue_events(issue_data, external_connected_events): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the external_connected_event dict later filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) # updates all the issues by the temporarily stored referenced_by events @@ -550,6 +547,7 @@ def merge_issue_events(issue_data, external_connected_events): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] + # return the filtered_connected_events dict for later reconstruction return filtered_connected_events From a9eed8abd536dd8feb917e3d062e7260d0c7c1df Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 4 Nov 2025 12:46:51 +0100 Subject: [PATCH 13/32] Incorporate requested changes includes: - updated comments - spelling mistake - fix for potential crash if script is used on old data Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index bc64c3b..1dbddea 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -264,7 +264,7 @@ def 
reformat_issues(issue_data): if issue["relatedCommits"] is None: issue["relatedCommits"] = [] - # if an issue has no reviewsList, an empty Listgets created + # if an issue has no reviewsList, an empty List gets created if issue["reviewsList"] is None: issue["reviewsList"] = [] @@ -272,6 +272,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet if issue["closed_at"] is None: issue["closed_at"] = "" @@ -740,9 +744,10 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev issue["eventsList"].append(resolution_event) elif event["event"] == "commented": - # "state_new" and "resolution" of the issue give the information about the state and the resolution of + # "state_new" of the issue gives the information about the state of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] + # if event is a review comment, it can contain suggestions if "contains_suggestion" in event: event["event_info_2"] = event["contains_suggestion"] else: From 8066db9932d9632e63ca5b062ce1ebf7996674b2 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 13:41:27 +0100 Subject: [PATCH 14/32] Add copilot user unification to author postprocessing author postprocessing now also contains a list of known copilot use names that can be extended to unify more different copilot users Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index a7a5488..ac83b69 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py 
@@ -14,7 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # Copyright 2026 by Thomas Bock # Copyright 2025 by Maximilian Löffler # All Rights Reserved. @@ -54,6 +54,15 @@ setup_logging() log = getLogger(__name__) +## +# GLOBAL VARIABLES +## + +# global variable containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + ## # RUN POSTPROCESSING ## @@ -82,7 +91,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. 
The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -186,6 +195,11 @@ def is_github_noreply_author(name, email): issue_data_new = [] for event in issue_data: + # unify events to use a single copilot user for all events triggered by a known copilot user + if unify_copilot_users and event[9] in known_copilot_users: + event[9] = copilot_unified_name + event[10] = copilot_unified_email + # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 From eb78dbaf6d78a7ea137401abbf47bbad7637907a Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 13:48:43 +0100 Subject: [PATCH 15/32] Assign copilot user data in case of specific events the events 'copilot_work_started' and 'copilot_work_finished' now always have the standard copilot user data Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1dbddea..b33c88a 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,7 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # Copyright 2026 by Thomas Bock # Copyright 2025 by Maximilian Löffler # All Rights Reserved. @@ -57,6 +57,9 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" +# Copilot username to be assigned in specific copilot events +copilot_username = "Copilot" + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -491,6 +494,12 @@ def merge_issue_events(issue_data, external_connected_events): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_username + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: From a0ebc1437e04a5ee1bf1f2f6d8a7a967f4b6b0f5 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:27:23 +0100 Subject: [PATCH 16/32] Add documentation for new copilot user unification Method doc updated to reflect new functionality Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index ac83b69..d7fcc6b 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -102,7 +102,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. + by this author. This method also unifies all known copilot users into a single user if desired. 
:param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -110,6 +110,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ github_user = "GitHub" github_email = "noreply@github.com" From e496e66ae8619bdbe3d2c091d03f0fbc3d9271f1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:48:39 +0100 Subject: [PATCH 17/32] Fix connected event assignment previously, the creator of the issues was falsely matched to the connected event instead of the user triggering the event Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index b33c88a..bd1c191 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -535,7 +535,7 @@ def merge_issue_events(issue_data, external_connected_events): # if there is no connected event yet at this timestamp, create a new entry for this event connected_info = dict() connected_info["issues"] = [issue["number"]] - connected_info["user"] = issue["user"] + connected_info["user"] = event["user"] connected_events[event["created_at"]] = connected_info # merge events, relatedCommits, relatedIssues and comment lists From 53e3b0f29e3163bd42afdf5d2703d114e03b09a1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:06:28 +0100 Subject: [PATCH 18/32] Unify copilot users in all files unification now done on all files, which should prevent any issues arising from unknown authors during anonymization also move all global variables to a new utils file 
Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 69 +++++++++++-------- github_user_utils/github_user_utils.py | 54 +++++++++++++++ issue_processing/issue_processing.py | 5 +- 3 files changed, 95 insertions(+), 33 deletions(-) create mode 100644 github_user_utils/github_user_utils.py diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index d7fcc6b..e908a2c 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -53,15 +53,10 @@ # create logger setup_logging() log = getLogger(__name__) +from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ + is_github_noreply_author, github_user, github_email, \ + commit_added_event, mentioned_event, subscribed_event -## -# GLOBAL VARIABLES -## - -# global variable containing all known copilot users and the name and mail adress copilot users will be assigned -known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} -copilot_unified_name = "Copilot" -copilot_unified_email = "copilot@example.com" ## # RUN POSTPROCESSING @@ -112,25 +107,6 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param bots_list: file name of the corresponding bot data :param unify_copilot_users: whether to unify known copilot users into a single user """ - github_user = "GitHub" - github_email = "noreply@github.com" - commit_added_event = "commit_added" - mentioned_event = "mentioned" - subscribed_event = "subscribed" - - """ - Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". 
- There are two options in Codeface how this can happen: - (1) Username is "GitHub" and e-mail address is "noreply@github.com" - (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" - - :param name: the name of the author to be checked - :param email: the email address of the author to be checked - :return: whether the given (name, email) pair belongs to the "GitHub " author - """ - def is_github_noreply_author(name, email): - return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) - # Check for all files in the result directory of the project whether they need to be adjusted for filepath, _, filenames in walk(data_path): @@ -139,20 +115,32 @@ def is_github_noreply_author(name, email): if authors_list in filenames: f = path.join(filepath, authors_list) log.info("Remove author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) author_data = csv_writer.read_from_csv(f) author_data_new = [] - + copilot_user_added = False for author in author_data: # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): - author_data_new.append(author) + # unify copilot author if desired + if unify_copilot_users and author[1] in known_copilot_users: + if not copilot_user_added: + author[1] = copilot_unified_name + author[2] = copilot_unified_email + copilot_user_added = True + author_data_new.append(author) + else: + author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) # (2) Remove e-mails from author 'GitHub ' from all emails.list files if emails_list in filenames: f = path.join(filepath, emails_list) log.info("Remove emails from author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", 
copilot_unified_name, copilot_unified_email, f) email_data = csv_writer.read_from_csv(f) email_data_new = [] @@ -160,6 +148,10 @@ def is_github_noreply_author(name, email): for email in email_data: # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): + # unify copilot users if desired + if unify_copilot_users and email[0] in known_copilot_users: + email[0] = copilot_unified_name + email[1] = copilot_unified_email email_data_new.append(email) else: log.warning("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) @@ -170,6 +162,8 @@ def is_github_noreply_author(name, email): if commits_list in filenames: f = path.join(filepath, commits_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) commit_data = csv_writer.read_from_csv(f) for commit in commit_data: @@ -178,6 +172,10 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(commit[5], commit[6]): commit[5] = commit[2] commit[6] = commit[3] + # unify copilot author if desired + if unify_copilot_users and commit[5] in known_copilot_users: + commit[5] = copilot_unified_name + commit[6] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -186,6 +184,8 @@ def is_github_noreply_author(name, email): if issues_github_list in filenames: f = path.join(filepath, issues_github_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) issue_data = csv_writer.read_from_csv(f) # read commit data @@ -200,7 +200,13 @@ def is_github_noreply_author(name, email): if unify_copilot_users and event[9] in known_copilot_users: event[9] = copilot_unified_name event[10] = copilot_unified_email - + if event[8] == 
commit_added_event and event[13][-1:1] in known_copilot_users: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = '"' + copilot_unified_name + '"' + elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = '"' + copilot_unified_name + '"' + event[13] = '"' + copilot_unified_email + '"' # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 @@ -380,6 +386,9 @@ def run_postprocessing(conf, resdir, backup_data): if person[4] == issue_event[12] and (quot_m + person[5] + quot_m) == issue_event[13]: issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m + # replace name in event info 2 if necessary + if person[4] == issue_event[13]: + issue_event[13] = person[1] csv_writer.write_to_csv(f, issue_data) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py new file mode 100644 index 0000000..20a3aa3 --- /dev/null +++ b/github_user_utils/github_user_utils.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# Copyright 2026 by Leo Sendelbach +# All Rights Reserved. +""" +This file serves as a collection of global variables and utility functions, which are used throughout the +issue data extraction and post-processing, in particular for the processing of GitHub and Copilot user data. +""" + +## +# GLOBAL VARIABLES +## + +# global variables containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + +## global variables for the GitHub author +github_user = "GitHub" +github_email = "noreply@github.com" +commit_added_event = "commit_added" +mentioned_event = "mentioned" +subscribed_event = "subscribed" + +## +# UTILITY FUNCTIONS +## + +def is_github_noreply_author(name, email): + """ + Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". + There are two options in Codeface how this can happen: + (1) Username is "GitHub" and e-mail address is "noreply@github.com" + (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" + + :param name: the name of the author to be checked + :param email: the email address of the author to be checked + :return: whether the given (name, email) pair belongs to the "GitHub " author + """ + + return (name == github_user and (email == github_email or email == (github_user + "." 
+ github_email))) \ No newline at end of file diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index bd1c191..f1ee54c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -40,6 +40,7 @@ from dateutil import parser as dateparser from csv_writer import csv_writer +from github_user_utils import copilot_unified_name # create logger setup_logging() @@ -57,8 +58,6 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" -# Copilot username to be assigned in specific copilot events -copilot_username = "Copilot" def run(): # get all needed paths and arguments for the method call. @@ -497,7 +496,7 @@ def merge_issue_events(issue_data, external_connected_events): # if event is a specific copilot event, assign the copilot user data if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": event["user"]["name"] = None - event["user"]["username"] = copilot_username + event["user"]["username"] = copilot_unified_name event["user"]["email"] = "" # if event dismisses a review, we can determine the original state of the corresponding review From 17f7da74225ffcb22c50cbd27d9065139f28a122 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:14:11 +0100 Subject: [PATCH 19/32] Add support for 'known agents' Known agents such as 'copilot' or 'claude' can now be read, similar to known bots. They will be flagged as agents during bot processing. Signed-off-by: Leo Sendelbach --- bot_processing/bot_processing.py | 34 ++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 9b18dd4..8c7df78 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# # Copyright 2021-2022 by Thomas Bock +# Copyright 2026 by Leo Sendelbach # Copyright 2026 by Thomas Bock # Copyright 2025 by Maximilian Löffler # All Rights Reserved. @@ -54,6 +55,7 @@ def run(): # (the known bots file is the file in which known bots have been added manually and project independent) __confdir = os.path.join(args.resdir, os.path.dirname(args.config)) __known_bots_file = os.path.abspath(os.path.join(__confdir, "known_github_bots.list")) + __known_agents_file = os.path.abspath(os.path.join(__confdir, "known_github_agents.list")) # run processing of bot data: # 1) load bot data @@ -61,7 +63,7 @@ def run(): # 2) load user data users = load_user_data(os.path.join(__resdir, "usernames.list")) # 3) update bot data with user data and additionally add known bots if they occur in the project - bots = add_user_data(bots, users, __known_bots_file) + bots = add_user_data(bots, users, __known_bots_file, __known_agents_file) # 4) dump result to disk print_to_disk(bots, __resdir) @@ -113,12 +115,13 @@ def load_user_data(user_data_file): return user_data -def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_reduced): +def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_data, bot_data_reduced): """ Check whether there are known bots occurring in the project. If so, add them to the bots list or update the bots list accordingly. 
:param known_bots_file: the file path to the list of known bot data + :param known_agents_file: the file path to the list of known agent data :param bot_data: the bot data originating from the bot prediction :param user_data: a dictionary from the issue data which maps GitHub usernames to authors :param bot_data_reduced: the bot data after mapping GitHub user names to authors @@ -128,6 +131,7 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red # Read the list of known bots known_bots = load_bot_data(known_bots_file, header = False) + known_agents = load_bot_data(known_agents_file, header = False) # Get the GitHub usernames of the bots predicted to be a bot predicted_bots = [bot[0] if len(bot) > 0 else "" for bot in bot_data] @@ -154,11 +158,33 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) break + for agent in known_agents: + + # (1) check if a known agent occurs in the GitHub issue data but has not been predicted + if agent[0] not in predicted_bots and agent[0] in user_data: + + # add the known agent as a bot to the bots list + additional_agent = dict() + additional_agent["user"] = user_data[agent[0]] + additional_agent["prediction"] = "Agent" + bot_data_reduced.append(additional_agent) + log.info("Add known agent '{}' to bot data.".format(additional_agent["user"])) + + # (2) handle known agents that are already present in the bots list + elif agent[0] in predicted_bots and agent[0] in user_data: + + # make sure that this bot has also been predicited to be an agent + for predicted_bot in bot_data_reduced: + if predicted_bot["user"] == user_data[agent[0]]: + predicted_bot["prediction"] = "Agent" + log.info("Mark user '{}' as agent in the bot data.".format(user_data[agent[0]])) + break + # return the updated bot data return bot_data_reduced -def add_user_data(bot_data, user_data, known_bots_file): +def 
add_user_data(bot_data, user_data, known_bots_file, known_agents_file): """ Add user data to bot data, i.e., replace username by name and e-mail. In addition, check in the global bots list whether there are authors in the projects which are @@ -202,7 +228,7 @@ def add_user_data(bot_data, user_data, known_bots_file): log.warning("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly - bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) + bot_data_reduced = check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_buffer, bot_data_reduced) return bot_data_reduced From 93be3d67ec66fbc6a07fe58c5df50ffcd0932160 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:31:59 +0100 Subject: [PATCH 20/32] Add better bot name variant support Add a helper function for creating bot name variants utilizing either '[bot]' or 'bot' suffix. Also update bot processing to check user buffer for all variants. 
Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 16 +++++++------- bot_processing/bot_processing.py | 8 +++++++ github_user_utils/github_user_utils.py | 21 ++++++++++++++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index e908a2c..45ef0e7 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -55,9 +55,9 @@ log = getLogger(__name__) from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ is_github_noreply_author, github_user, github_email, \ - commit_added_event, mentioned_event, subscribed_event - + commit_added_event, mentioned_event, subscribed_event, generate_botname_variants +known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## # RUN POSTPROCESSING ## @@ -125,7 +125,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): # unify copilot author if desired - if unify_copilot_users and author[1] in known_copilot_users: + if unify_copilot_users and author[1] in known_copilot_users_extended: if not copilot_user_added: author[1] = copilot_unified_name author[2] = copilot_unified_email @@ -149,7 +149,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): # unify copilot users if desired - if unify_copilot_users and email[0] in known_copilot_users: + if unify_copilot_users and email[0] in known_copilot_users_extended: email[0] = copilot_unified_name email[1] = copilot_unified_email email_data_new.append(email) @@ -173,7 +173,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth commit[5] = commit[2] 
commit[6] = commit[3] # unify copilot author if desired - if unify_copilot_users and commit[5] in known_copilot_users: + if unify_copilot_users and commit[5] in known_copilot_users_extended: commit[5] = copilot_unified_name commit[6] = copilot_unified_email @@ -197,13 +197,13 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth for event in issue_data: # unify events to use a single copilot user for all events triggered by a known copilot user - if unify_copilot_users and event[9] in known_copilot_users: + if unify_copilot_users and event[9] in known_copilot_users_extended: event[9] = copilot_unified_name event[10] = copilot_unified_email - if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users: + if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users_extended: # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user event[13] = '"' + copilot_unified_name + '"' - elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users: + elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users_extended: # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user event[12] = '"' + copilot_unified_name + '"' event[13] = '"' + copilot_unified_email + '"' diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 8c7df78..5d58eb9 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -224,6 +224,14 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): bot_reduced["user"] = user_buffer[user[0]] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) + elif user[0] + "bot" in user_buffer.keys(): + bot_reduced["user"] = user_buffer[user[0] + "bot"] + bot_reduced["prediction"] = user[-1] + bot_data_reduced.append(bot_reduced) + 
elif user[0] + "[bot]" in user_buffer.keys(): + bot_reduced["user"] = user_buffer[user[0] + "[bot]"] + bot_reduced["prediction"] = user[-1] + bot_data_reduced.append(bot_reduced) else: log.warning("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 20a3aa3..561a345 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -51,4 +51,23 @@ def is_github_noreply_author(name, email): :return: whether the given (name, email) pair belongs to the "GitHub " author """ - return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) \ No newline at end of file + return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) + +def generate_botname_variants(botnames): + """ + Helper function to generate variants of bot names, which are used in the list of + known bots and agents as well as during author postprocessing. + + :param botnames: the list of bot names for which variants should be generated + :return: a set of bot name variants + """ + + botname_variants = set() + for botname in botnames: + botname_variants.add(botname) + if botname.endswith("[bot]"): + botname_variants.add(botname[:-5] + "bot") + elif botname.endswith("bot"): + botname_variants.add(botname[:-3] + "[bot]") + + return botname_variants From 7c436c331adf7c237885b6603217d535a164d89d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 20 Feb 2026 17:17:46 +0100 Subject: [PATCH 21/32] Add better bot name handling Add a helper function that given a botname and a list of names, returns which bot name variant is contained in the list (or None). This is used whenever we check if a known bot is in the userdata or has been predicted to be a bot, and means that botnames in the known_bots file do not need to be duplicated for each variant. 
Also, automatically add all known copilot users to the known_agents list, and then unify those during author postprocessing. Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 13 ++++- bot_processing/bot_processing.py | 55 ++++++++++++++----- github_user_utils/github_user_utils.py | 8 +-- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 45ef0e7..c19d2bc 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -250,6 +250,9 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth if bots_list in filenames: f = path.join(filepath, bots_list) log.info("Remove author %s <%s> from %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) + copilot_user_added = False bot_data = csv_writer.read_from_csv(f) bot_data_new = [] @@ -257,7 +260,15 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth for entry in bot_data: # keep bot entry only if it should not be removed if not is_github_noreply_author(entry[0], entry[1]): - bot_data_new.append(entry) + # unify copilot users if desired + if unify_copilot_users and entry[0] in known_copilot_users_extended: + if not copilot_user_added: + entry[0] = copilot_unified_name + entry[1] = copilot_unified_email + copilot_user_added = True + bot_data_new.append(entry) + else: + bot_data_new.append(entry) else: log.warning("Remove entry %s <%s> from bots list.", entry[0], entry[1]) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 5d58eb9..b63f87e 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -28,6 +28,7 @@ from codeface_utils.configuration import Configuration from csv_writer import csv_writer +from
github_user_utils import known_copilot_users, generate_botname_variants # create logger setup_logging() @@ -139,25 +140,35 @@ def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_d for bot in known_bots: # (1) check if a known bot occurs in the GitHub issue data but has not been predicted - if bot[0] not in predicted_bots and bot[0] in user_data: + bot_variation_predicted_bots = containing_bot_variation(bot[0], predicted_bots) + bot_variation_user_data = containing_bot_variation(bot[0], user_data) + if bot_variation_predicted_bots is None and bot_variation_user_data is not None: # add the known bot as a bot to the bots list additional_bot = dict() - additional_bot["user"] = user_data[bot[0]] + additional_bot["user"] = user_data[bot_variation_user_data] additional_bot["prediction"] = "Bot" bot_data_reduced.append(additional_bot) log.info("Add known bot '{}' to bot data.".format(additional_bot["user"])) # (2) handle known bots that are already present in the bots list - elif bot[0] in predicted_bots and bot[0] in user_data: + elif bot_variation_predicted_bots is not None and bot_variation_user_data is not None: # make sure that this bot has also been predicited to be bot for predicted_bot in bot_data_reduced: - if predicted_bot["user"] == user_data[bot[0]]: + if predicted_bot["user"] == user_data[bot_variation_user_data]: predicted_bot["prediction"] = "Bot" - log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) + log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot_variation_user_data])) break + # get list of known agents and combine it with the list of known copilot users + copilot_users_variants = generate_botname_variants(known_copilot_users) + # get list of known agent names + known_agents_names = [agent[0] for agent in known_agents] + for copilot_user in copilot_users_variants: + if copilot_user not in known_agents_names: + known_agents.append([copilot_user]) + for agent in known_agents: # (1) 
check if a known agent occurs in the GitHub issue data but has not been predicted @@ -220,16 +231,9 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): continue # get user information if available - if user[0] in list(user_buffer.keys()): - bot_reduced["user"] = user_buffer[user[0]] - bot_reduced["prediction"] = user[-1] - bot_data_reduced.append(bot_reduced) - elif user[0] + "bot" in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0] + "bot"] - bot_reduced["prediction"] = user[-1] - bot_data_reduced.append(bot_reduced) - elif user[0] + "[bot]" in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0] + "[bot]"] + bot_variation = containing_bot_variation(user[0], user_buffer.keys()) + if bot_variation is not None: + bot_reduced["user"] = user_buffer[bot_variation] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: @@ -241,6 +245,27 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): return bot_data_reduced +def containing_bot_variation(botname, name_list): + """ + Helper function to return the variation of a given bot name that occurs in a list of names. + + :param botname: the bot name for which the variation should be returned + :param name_list: the list of names to be checked for containing the bot name or a variation of it + :return: the variation of the given bot name that occurs in the given list of names, or None if no such variation exists + """ + + if botname in name_list: + return botname + elif botname + "bot" in name_list: + return botname + "bot" + elif botname + "[bot]" in name_list: + return botname + "[bot]" + elif botname.replace("[", "").replace("]", "") in name_list: + return botname.replace("[", "").replace("]", "") + else: + return None + + def print_to_disk(bot_data, results_folder): """ Print bot data to file "bots.list" in result folder. 
diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 561a345..652b63b 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -24,7 +24,7 @@ ## # global variables containing all known copilot users and the name and mail adress copilot users will be assigned -known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agent[bot]"} copilot_unified_name = "Copilot" copilot_unified_email = "copilot@example.com" @@ -65,9 +65,7 @@ def generate_botname_variants(botnames): botname_variants = set() for botname in botnames: botname_variants.add(botname) - if botname.endswith("[bot]"): - botname_variants.add(botname[:-5] + "bot") - elif botname.endswith("bot"): - botname_variants.add(botname[:-3] + "[bot]") + botname = botname.replace("[", "").replace("]", "") + botname_variants.add(botname) return botname_variants From f0f95b3d2cf8e88e939349143097e88eabffac6d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 3 Mar 2026 15:43:11 +0100 Subject: [PATCH 22/32] Add copilot user unification for more events also add agents to bot handling, fix formatting for event_info_2 and subissues also fix a typo where strings would not have their quotes correctly removed Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 42 +++++++++++-------- bot_processing/bot_processing.py | 2 +- github_user_utils/__init__.py | 1 + github_user_utils/github_user_utils.py | 7 ++++ issue_processing/issue_processing.py | 8 ++-- 5 files changed, 38 insertions(+), 22 deletions(-) create mode 100644 github_user_utils/__init__.py diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index c19d2bc..cc0b912 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -50,12 +50,15 
@@ from codeface_utils.configuration import Configuration from csv_writer import csv_writer +from github_user_utils.github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ + is_github_noreply_author, github_user, github_email, \ + commit_added_event, mentioned_event, subscribed_event, \ + assigned_event, unassigned_event, review_requested_event, \ + review_request_removed_event, generate_botname_variants, quot_m + # create logger setup_logging() log = getLogger(__name__) -from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ - is_github_noreply_author, github_user, github_email, \ - commit_added_event, mentioned_event, subscribed_event, generate_botname_variants known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## @@ -176,6 +179,9 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth if unify_copilot_users and commit[5] in known_copilot_users_extended: commit[5] = copilot_unified_name commit[6] = copilot_unified_email + if unify_copilot_users and commit[2] in known_copilot_users_extended: + commit[2] = copilot_unified_name + commit[3] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -194,19 +200,20 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] - for event in issue_data: # unify events to use a single copilot user for all events triggered by a known copilot user if unify_copilot_users and event[9] in known_copilot_users_extended: event[9] = copilot_unified_name event[10] = copilot_unified_email - if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users_extended: - # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user - event[13] = '"' 
+ copilot_unified_name + '"' - elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users_extended: - # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user - event[12] = '"' + copilot_unified_name + '"' - event[13] = '"' + copilot_unified_email + '"' + if unify_copilot_users and event[8] == commit_added_event and event[13][1:-1] in known_copilot_users_extended: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = quot_m + copilot_unified_name + quot_m + elif unify_copilot_users and event[8] in (mentioned_event, subscribed_event, assigned_event, unassigned_event, + review_requested_event, review_request_removed_event) \ + and event[12] in known_copilot_users_extended: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = copilot_unified_name + event[13] = quot_m + copilot_unified_email + quot_m # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 @@ -305,9 +312,6 @@ def run_postprocessing(conf, resdir, backup_data): bugs_jira_list = "bugs-jira.list" bots_list = "bots.list" - # When looking at elements originating from json lists, we need to consider quotation marks around the string - quot_m = "\"" - data_path = path.join(resdir, conf["project"], conf["tagging"]) # Correctly replace author 'GitHub ' in the commit data and in "commit_added" events of the @@ -398,8 +402,8 @@ def run_postprocessing(conf, resdir, backup_data): issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m # replace name in event info 2 if necessary - if person[4] == issue_event[13]: - issue_event[13] = person[1] + if quot_m + person[4] + quot_m == issue_event[13]: + issue_event[13] = quot_m + person[1] + quot_m 
csv_writer.write_to_csv(f, issue_data) @@ -466,8 +470,12 @@ def run_postprocessing(conf, resdir, backup_data): # the bot is already in the list, check if there are different predictions stored_bot = bot_names_and_emails[(bot[0], bot[1])] if stored_bot[2] != bot[2]: + # if either of the predictions is agent, keep agent + if (stored_bot[2] == "Agent" or bot[2] == "Agent"): + stored_bot[2] = "Agent" + bot_names_and_emails[(bot[0], bot[1])] = stored_bot # if either of the predictions is bot, keep bot - if (stored_bot[2] == "Bot" or bot[2] == "Bot"): + elif (stored_bot[2] == "Bot" or bot[2] == "Bot"): stored_bot[2] = "Bot" bot_names_and_emails[(bot[0], bot[1])] = stored_bot # otherwise, if either of the predictions is human, keep human diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index b63f87e..89bb559 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -28,7 +28,7 @@ from codeface_utils.configuration import Configuration from csv_writer import csv_writer -from github_user_utils import known_copilot_users, generate_botname_variants +from github_user_utils.github_user_utils import known_copilot_users, generate_botname_variants # create logger setup_logging() diff --git a/github_user_utils/__init__.py b/github_user_utils/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/github_user_utils/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 652b63b..20fa8d8 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -34,6 +34,13 @@ commit_added_event = "commit_added" mentioned_event = "mentioned" subscribed_event = "subscribed" +assigned_event = "assigned" +unassigned_event = "unassigned" +review_requested_event = "review_requested" +review_request_removed_event = "review_request_removed" + +# When looking at elements originating from json lists, we 
need to consider quotation marks around the string +quot_m = "\"" ## # UTILITY FUNCTIONS diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index f1ee54c..a81de96 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -40,7 +40,7 @@ from dateutil import parser as dateparser from csv_writer import csv_writer -from github_user_utils import copilot_unified_name +from github_user_utils.github_user_utils import copilot_unified_name # create logger setup_logging() @@ -757,9 +757,9 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev event["event_info_1"] = issue["state_new"] # if event is a review comment, it can contain suggestions if "contains_suggestion" in event: - event["event_info_2"] = event["contains_suggestion"] + event["event_info_2"] = str(event["contains_suggestion"]) else: - event["event_info_2"] = False + event["event_info_2"] = str(False) elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits @@ -939,7 +939,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([issue["subIssues"]]), # components + json.dumps(issue["subIssues"]), # components event["event"], event["user"]["name"], event["user"]["email"], From 6ed1def97d10c2c03cb9be57d12549645188d4b5 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 13:58:06 +0100 Subject: [PATCH 23/32] Add reason for conversation locking lock reason is saved in event_info_1 Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a81de96..7e992ac 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -537,6 +537,10 @@ def merge_issue_events(issue_data, 
external_connected_events): connected_info["user"] = event["user"] connected_events[event["created_at"]] = connected_info + # if event is a locked event, save the lock reason in event_info_1 + if event["event"] == "locked": + event["event_info_1"] = event["lock_reason"] + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] From 17810bec25b43f30a42f19a011aa9693b40d4304 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 14:09:27 +0100 Subject: [PATCH 24/32] Fix spelling and documentation docstrings should now more accurately reflect parameters and return values Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 7e992ac..858dad1 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -237,7 +237,6 @@ def reformat_issues(issue_data): Re-arrange issue data structure. :param issue_data: the issue data to re-arrange - :return: the re-arranged issue data """ log.info("Re-arranging Github issues...") @@ -302,7 +301,8 @@ def merge_issue_events(issue_data, external_connected_events): All issue events are merged together in the eventsList. This simplifies processing in later steps. 
:param issue_data: the issue data from which the events shall be merged - :return: the issue data with merged eventsList + :param external_connected_events: a dict to store connected events to external issues + :return: a filtered dict of connected events for future reconstruction """ log.info("Merge issue events ...") @@ -576,17 +576,17 @@ def filter_connected_events(key, value, external_connected_events): # if 2 connected events exist, matching them is trivial if num_issues == 2: return True - occurences = {x: value["issues"].count(x) for x in set(value["issues"])} + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} # otherwise, if it is an even number, check if it can be easily matched, # meaning that exactly half the events occur in the same issue - if num_issues % 2 == 0 and num_issues/2 in occurences.values(): + if num_issues % 2 == 0 and num_issues/2 in occurrences.values(): # duplicate issue list for matching the issues later value["multi_issues_copy"] = list(value["issues"]) return True # if it is an odd number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue - if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values(): - for sub_key, sub_value in occurences.iteritems(): + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurrences.values(): + for sub_key, sub_value in occurrences.iteritems(): # then, assign one of them as an external connected event and proceed as in previous case if sub_value == (num_issues + 1)/2: new_entry = dict() @@ -606,7 +606,8 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev Re-format event information dependent on the event type. 
:param issue_data: the data of all issues that shall be re-formatted - :return: the issue data with updated event information + :param filtered_connected_events: the dict of connected events which can be reconstructed + :param external_connected_events: the dict of connected events to external issues """ log.info("Update event information ...") @@ -646,9 +647,9 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev # and we only have 2 issues in the list, connect to the other issue event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] else: - # and we have more than two issues, count each issue's occurences - occurences = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurences[issue["number"]] == max(occurences.values()): + # and we have more than two issues, count each issue's occurrences + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurrences[issue["number"]] == max(occurrences.values()): # if our issue is the most common one, that means it is the common denominator # for all connected events at this time # so this event connects to any other issue @@ -658,7 +659,7 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev event["event_info_1"] = number else: # otherwise, connect this event to the common denominator - event["event_info_1"] = max(occurences, key=occurences.get) + event["event_info_1"] = max(occurrences, key=occurrences.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -788,7 +789,6 @@ def insert_user_data(issues, conf, resdir): :param issues: the issues to retrieve user data from :param conf: the project configuration :param resdir: the directory in which the username-to-user-list should be dumped - :return: the updated issue data """ log.info("Syncing users with ID service...") From 
ca71f40eefe16a450b7b43981ab23e6c20cc203a Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 14:12:43 +0100 Subject: [PATCH 25/32] Remove old state from jira state_updated events For consistency with github events Signed-off-by: Leo Sendelbach --- issue_processing/jira_issue_processing.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index a431ffa..6999764 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -467,21 +467,18 @@ def load_issues_via_api(issues, persons, url, referenced_bys): for change in changelog.histories: # default values for state and resolution - old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved" + new_state, old_resolution, new_resolution = "open", "unresolved", "unresolved" # all changes in the issue changelog are checked if they contain a useful information for item in change.items: # state_updated event gets created and added to the issue history if item.field == "status": - if item.fromString is not None: - old_state = item.fromString.lower() if item.toString is not None: new_state = item.toString.lower() history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state - history["event_info_2"] = old_state if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: From 40bb0010a97fed74f04f2d5f6620cbf059ede821 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 13 Mar 2026 11:32:36 +0100 Subject: [PATCH 26/32] Fix jira processing error previously removed event_info_2 for state_updated event, leading to crashes of the issue processing. Now, it instead contains an empty string. 
Also fix a minor spelling mistake Signed-off-by: --- bot_processing/bot_processing.py | 2 +- issue_processing/jira_issue_processing.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 89bb559..0d0aa37 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -84,7 +84,7 @@ def load_bot_data(bot_file, header = True): # check if file exists and exit early if not if not os.path.exists(bot_file): - log.error("Bot file '{}' does not exist! Exiting early...".format(bot_file)) + log.error("Bot/Agent file '{}' does not exist (can be empty)! Exiting early...".format(bot_file)) sys.exit(-1) bot_data = csv_writer.read_from_csv(bot_file, delimiter=',') diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 6999764..dd15c78 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -19,7 +19,7 @@ # Copyright 2020-2021 by Thomas Bock # Copyright 2026 by Thomas Bock # Copyright 2023, 2025 by Maximilian Löffler -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. 
@@ -479,6 +479,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state + history["event_info_2"] = "" if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: From 3f0ea9809833fee90e847baccf1d4b2d6ce4c48e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 12 May 2026 12:45:05 +0200 Subject: [PATCH 27/32] Fix issue with broken commits in merge events event_info_1 should remain empty in that case Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 858dad1..ae54a17 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -512,7 +512,7 @@ def merge_issue_events(issue_data, external_connected_events): event["user"] = event["assigner"] # if event is merged event, save the hash of the merge commit in event_info_1 - if event["event"] == "merged": + if event["event"] == "merged" and not event["commit"] is None: event["event_info_1"] = event["commit"]["hash"] # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction From d63e68b95ed112cfc69dd425556e6142cf1e6cee Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 26 May 2026 15:52:05 +0200 Subject: [PATCH 28/32] Fix 'null' in relatedIssues, subIssues fields crash These fields are now replaced with empty lists when null Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index ae54a17..8451ec7 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -270,11 +270,11 @@ def reformat_issues(issue_data): 
issue["reviewsList"] = [] # if an issue has no relatedIssues, an empty List gets created - if "relatedIssues" not in issue: + if issue["relatedIssues"] is None: issue["relatedIssues"] = [] # if an issue has no sub-issue list, an empty List gets created - if "subIssues" not in issue: + if issue["subIssues"] is None: issue["subIssues"] = [] # add "closed_at" information if not present yet From f66c10f4a0ad4fc9622ce87ddd39934f02084871 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 9 Jun 2026 13:11:25 +0200 Subject: [PATCH 29/32] Fix for python 3 changing .keys() call on maps after rebase onto python 3 branch Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 8451ec7..29aee23 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -517,15 +517,15 @@ def merge_issue_events(issue_data, external_connected_events): # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction if event["event"] == "connected": - if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + if event["created_at"] in list(connected_events.keys()) and connected_events[event["created_at"]]["user"] == event["user"]: # if there is already a connected event at this time by this user, add this event to the list connected_events[event["created_at"]]["issues"].append(issue["number"]) - elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ + elif subtract_seconds_from_time(event["created_at"], 1) in list(connected_events.keys()) \ and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: # same as above, but accounting for a possible difference in timestamps of 1 second between matching events 
connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) - elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ + elif subtract_seconds_from_time(event["created_at"], -1) in list(connected_events.keys()) \ and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: # same as above, with offset calculated in the other direction connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) From fc5d13ad212aff67654fb1629221d9f15b1029cb Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 9 Jun 2026 15:51:12 +0200 Subject: [PATCH 30/32] Fix iteritems Iteritems() does not work in python3, instead use items() Signed-off-by: Leo Sendelbach --- github_user_utils/github_user_utils.py | 2 +- issue_processing/issue_processing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 20fa8d8..3db2efd 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -23,7 +23,7 @@ # GLOBAL VARIABLES ## -# global variables containing all known copilot users and the name and mail adress copilot users will be assigned +# global variables containing all known copilot users and the name and mail address copilot users will be assigned known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agent[bot]"} copilot_unified_name = "Copilot" copilot_unified_email = "copilot@example.com" diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 29aee23..9f69429 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -586,7 +586,7 @@ def filter_connected_events(key, value, external_connected_events): # if it is an odd
number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue if num_issues % 2 == 1 and (num_issues + 1)/2 in occurrences.values(): - for sub_key, sub_value in occurrences.iteritems(): + for sub_key, sub_value in occurrences.items(): # then, assign one of them as an external connected event and proceed as in previous case if sub_value == (num_issues + 1)/2: new_entry = dict() From aae915b61523aaba36bfed03322d28b5238cabc1 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Sat, 13 Jun 2026 17:54:11 +0200 Subject: [PATCH 31/32] Add missing logger imports and add utf-8 header consistently where missing Signed-off-by: Thomas Bock --- anonymization/anonymization.py | 1 + author_postprocessing/author_postprocessing.py | 1 + bot_processing/bot_processing.py | 1 + codeface_utils/configuration.py | 1 + codeface_utils/dbmanager.py | 2 +- codeface_utils/linktype.py | 1 + codeface_utils/util.py | 1 + issue_processing/issue_processing.py | 1 + issue_processing/jira_issue_processing.py | 1 + 9 files changed, 9 insertions(+), 1 deletion(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index f4bfecb..291a575 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -34,6 +34,7 @@ from logging import getLogger from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging from csv_writer import csv_writer # create logger diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index cc0b912..5840e4e 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -48,6 +48,7 @@ from logging import getLogger from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging from csv_writer import csv_writer from github_user_utils.github_user_utils import known_copilot_users, copilot_unified_name, 
copilot_unified_email, \ diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 0d0aa37..c51ed14 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -27,6 +27,7 @@ from logging import getLogger from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging from csv_writer import csv_writer from github_user_utils.github_user_utils import known_copilot_users, generate_botname_variants diff --git a/codeface_utils/configuration.py b/codeface_utils/configuration.py index e4a654a..49d35ab 100644 --- a/codeface_utils/configuration.py +++ b/codeface_utils/configuration.py @@ -1,3 +1,4 @@ +# coding=utf-8 # This file is part of codeface-extraction, which is free software: you # can redistribute it and/or modify it under the terms of the GNU General # Public License as published by the Free Software Foundation, version 2. diff --git a/codeface_utils/dbmanager.py b/codeface_utils/dbmanager.py index aecc172..407a931 100644 --- a/codeface_utils/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +# coding=utf-8 # This file is part of Codeface. Codeface is free software: you can # redistribute it and/or modify it under the terms of the GNU General Public # License as published by the Free Software Foundation, version 2. diff --git a/codeface_utils/linktype.py b/codeface_utils/linktype.py index 617d11f..953ce4f 100644 --- a/codeface_utils/linktype.py +++ b/codeface_utils/linktype.py @@ -1,3 +1,4 @@ +# coding=utf-8 # This file is part of codeface-extraction, which is free software: you # can redistribute it and/or modify it under the terms of the GNU General # Public License as published by the Free Software Foundation, version 2. 
diff --git a/codeface_utils/util.py b/codeface_utils/util.py index 59402d8..80c1b51 100644 --- a/codeface_utils/util.py +++ b/codeface_utils/util.py @@ -1,3 +1,4 @@ +# coding=utf-8 # This file is part of codeface-extraction, which is free software: you # can redistribute it and/or modify it under the terms of the GNU General # Public License as published by the Free Software Foundation, version 2. diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 9f69429..01a9fab 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -37,6 +37,7 @@ from codeface_utils.cluster.idManager import dbIdManager, csvIdManager from codeface_utils.configuration import Configuration from codeface_utils.dbmanager import DBManager +from codeface_utils.util import setup_logging from dateutil import parser as dateparser from csv_writer import csv_writer diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index dd15c78..b308d4b 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -38,6 +38,7 @@ from codeface_utils.cluster.idManager import dbIdManager, csvIdManager from codeface_utils.configuration import Configuration from codeface_utils.dbmanager import DBManager +from codeface_utils.util import setup_logging from csv_writer import csv_writer From a045792990a8c849f58797f672970ca819f69594 Mon Sep 17 00:00:00 2001 From: Thomas Bock Date: Sat, 13 Jun 2026 18:04:58 +0200 Subject: [PATCH 32/32] Fix encoding issue in python3 As strings are already utf-8 encoded, don't convert them to utf-8 encoded strings any more. 
Signed-off-by: Thomas Bock --- codeface_extraction/extractions.py | 8 +++----- codeface_utils/cluster/idManager.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index 9c636dd..0210b73 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -15,6 +15,7 @@ # Copyright 2015-2018 by Claus Hunsen # Copyright 2016, 2018-2019 by Thomas Bock # Copyright 2019, 2021 by Thomas Bock +# Copyright 2026 by Thomas Bock # Copyright 2018 by Barbara Eckl # Copyright 2018 by Tina Schuh # Copyright 2025 by Maximilian Löffler @@ -759,18 +760,15 @@ def fix_name_encoding(name): if name is None: return name - # encode utf-8 - name = name.encode('utf-8') - # find out character set of the encoded name - info = decode_header(str(name)) + info = decode_header(name) try: # Apply correct encoding and return unicode string return str(make_header(info)) except UnicodeDecodeError: # Undo utf-8 encoding and return unicode string - return str(name.decode('utf-8')) + return name except LookupError: # Encoding not found, return string as is return name diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 43a4be5..b7c3c0d 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -14,6 +14,7 @@ # Copyright 2010, 2011 by Wolfgang Mauerer # Copyright 2012, 2013 by Siemens AG, Wolfgang Mauerer # Copyright 2025 by Maximilian Löffler +# Copyright 2026 by Thomas Bock # All Rights Reserved. # # The code in this file originates from: @@ -225,7 +226,7 @@ def getPersonFromDB(self, person_id): log.exception("Could not reach ID service. Is the server running?\n") raise - result = res.read() + result = res.read().decode("utf-8") jsond = json.loads(result)[0] return (jsond)