-
Notifications
You must be signed in to change notification settings - Fork 330
Dataset creation for backout commits #4159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 29 commits
eabf9bb
aaf8386
c096468
24046fd
3eb6605
2db5029
3516c09
0544b27
49570ac
fc37940
10314dd
943eb40
8ed0784
39ab450
fe8114b
74939f2
be10d51
3a406ef
bc23a22
6058305
e666c2e
40bbe1b
a4c5bff
f133041
d202b0b
79152a3
4740196
38d6cf8
9fc018c
846210f
ae28dcf
c6f6a8f
37c51b6
fad6df6
fb7a17d
bfc77e4
66108ad
0d83fa7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,226 @@ | ||
| import json | ||
| import logging | ||
| import os | ||
| from collections.abc import Generator | ||
| from datetime import datetime, timedelta | ||
|
|
||
| from tqdm import tqdm | ||
|
|
||
| from bugbug import bugzilla, db, repository | ||
|
|
||
| logging.basicConfig(level=logging.INFO) | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def download_databases() -> None:
    """Clone the Mercurial repository and download the bugs and commits databases.

    Raises:
        RuntimeError: if either database download fails.
    """
    logger.info("Cloning Mercurial database...")
    repository.clone(repo_dir="hg_dir")

    logger.info("Downloading bugs database...")
    # Raise explicitly instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip the failure check.
    if not db.download(bugzilla.BUGS_DB):
        raise RuntimeError("Failed to download the bugs database")

    logger.info("Downloading commits database...")
    if not db.download(repository.COMMITS_DB, support_files_too=True):
        raise RuntimeError("Failed to download the commits database")
|
|
||
|
|
||
def preprocess_commits_and_bugs() -> tuple[dict, dict, dict]:
    """Build in-memory lookup structures from the commits and bugs databases.

    Returns:
        commit_dict: commit node -> minimal commit record (node, bug_id,
            pushdate, backedoutby, backsout).
        bug_to_commit_dict: bug id -> list of commit records, in DB order.
        bug_dict: bug id -> resolution string (used to check for 'FIXED').
    """
    logger.info("Preprocessing commits and bugs...")
    # Dead `bug_dict = {}` initialization removed: it was unconditionally
    # rebound by the comprehension below.
    commit_dict: dict = {}
    bug_to_commit_dict: dict[int, list] = {}

    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        # Keep only the fields used downstream to limit memory usage.
        commit_data = {
            key: commit[key]
            for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout"]
        }
        commit_dict[commit["node"]] = commit_data
        bug_to_commit_dict.setdefault(commit["bug_id"], []).append(commit_data)

    # We only require the bug's resolution (to check if it is 'FIXED').
    bug_dict = {
        bug["id"]: bug["resolution"] for bug in bugzilla.get_bugs(include_invalid=True)
    }

    return commit_dict, bug_to_commit_dict, bug_dict
|
|
||
|
|
||
def has_conflicts(diff: str) -> bool:
    """Return True if the diff contains any conflict markers. Used with merge-tool ':fail'."""
    return "<<<<<<<" in diff or "=======" in diff or ">>>>>>>" in diff
|
|
||
|
|
||
def generate_datapoints(
    commit_limit: int,
    commit_dict: dict,
    bug_to_commit_dict: dict,
    bug_dict: dict,
    repo_dir: str,
) -> Generator[dict, None, None]:
    """Yield one datapoint per backed-out commit whose bug was eventually FIXED.

    Args:
        commit_limit: maximum number of commits to scan (capped below).
        commit_dict: commit node -> commit record. Kept for interface
            compatibility; not used directly in this function.
        bug_to_commit_dict: bug id -> ordered list of commit records.
        bug_dict: bug id -> bug resolution string.
        repo_dir: path of the local Mercurial clone used to compute diffs.

    Yields:
        dicts describing the inducing commit, its backout, the candidate
        fixing commit, and the diff between inducing and fixing commits.
    """
    # Cap at the approximate size of the commits database so the limit is
    # meaningful even when the caller passes a huge number.
    MAX_COMMITS = 709458
    commit_limit = min(commit_limit, MAX_COMMITS)

    logger.info("Generating datapoints...")

    counter = 0
    for commit in tqdm(
        repository.get_commits(
            include_no_bug=True, include_backouts=True, include_ignored=True
        )
    ):
        # Check the limit at the top of the loop. The original check sat at
        # the bottom, after every `continue`, so filtered commits never hit
        # it and the scan could run arbitrarily far past commit_limit.
        counter += 1
        if counter > commit_limit:
            break

        bug_info = bug_dict.get(commit["bug_id"])

        # Restrict to recent history (~2 years).
        pushdate = datetime.strptime(commit["pushdate"], "%Y-%m-%d %H:%M:%S")
        if (datetime.now() - pushdate) > timedelta(days=730):
            continue

        # We only add the commit if it has been backed out and the bug it is for is FIXED.
        if not commit["backedoutby"] or bug_info != "FIXED":
            continue

        fixing_commit, non_backed_out_commits = find_next_commit(
            commit["bug_id"],
            bug_to_commit_dict,
            commit["node"],
            commit["backedoutby"],
        )

        # Skip ambiguous cases: no fix found, or several candidate fixes.
        if not fixing_commit or non_backed_out_commits > 1:
            continue

        commit_diff = repository.get_diff(
            repo_dir, commit["node"], fixing_commit["node"]
        )
        if not commit_diff:
            continue

        # get_diff returns bytes; decode before scanning for conflict markers.
        commit_diff_text = commit_diff.decode("utf-8")

        # Diffs produced with merge-tool ':fail' may embed conflict markers.
        if has_conflicts(commit_diff_text):
            continue

        yield {
            "non_backed_out_commits": non_backed_out_commits,
            "fix_found": True,
            "bug_id": commit["bug_id"],
            "inducing_commit": commit["node"],
            "backout_commit": commit["backedoutby"],
            "fixing_commit": fixing_commit["node"],
            "commit_diff": commit_diff_text,
        }
|
|
||
|
|
||
def find_next_commit(
    bug_id: int, bug_to_commit_dict: dict, inducing_node: str, backout_node: str
) -> tuple[dict, int]:
    """Locate the candidate fixing commit that landed after a backout.

    Scans the bug's commits in order. Once the backout commit has been seen,
    the first later commit that was neither backed out nor backs anything out
    is taken as the fix; every later commit that was not backed out is
    counted as well.

    Returns:
        A pair of (fixing commit record, count of non-backed-out commits seen
        after the backout). The record is {} when no usable fix exists or the
        candidate coincides with the inducing or backout commit.
    """
    seen_backout = False
    fix = None
    survivors = 0

    for candidate in bug_to_commit_dict[bug_id]:
        if seen_backout and not candidate["backedoutby"]:
            survivors += 1
            if fix is None and not candidate["backsout"]:
                fix = candidate

        # Flip the flag after the counting step so the backout commit
        # itself is never counted or chosen.
        if candidate["node"] == backout_node:
            seen_backout = True

    if fix is None or fix["node"] in (inducing_node, backout_node):
        return {}, survivors

    return fix, survivors
|
|
||
|
|
||
def save_datasets(
    directory_path: str, dataset_filename: str, data_generator, batch_size: int = 10
) -> None:
    """Stream datapoints from *data_generator* into a JSON array on disk.

    Items are buffered and written in batches of *batch_size*; each batch is
    flushed and fsynced so progress survives an interruption.

    Args:
        directory_path: directory in which to create the dataset file.
        dataset_filename: name of the JSON file to write.
        data_generator: iterable of datapoint dicts. The transient
            'fix_found' key is stripped before writing.
        batch_size: number of items buffered between disk flushes.
    """
    os.makedirs(directory_path, exist_ok=True)
    logger.info(f"Directory {directory_path} created")

    dataset_filepath = os.path.join(directory_path, dataset_filename)

    fix_found_counter = 0
    fix_batch: list = []

    with open(dataset_filepath, "w") as file:
        file.write("[\n")
        first = True

        def flush_batch(batch: list, is_first: bool) -> None:
            # Single writer for both the in-loop and trailing batches;
            # the original duplicated this write/flush/fsync logic verbatim.
            if not is_first:
                file.write(",\n")
            file.write(",\n".join(json.dumps(i, indent=4) for i in batch))
            file.flush()
            os.fsync(file.fileno())

        logger.info("Populating dataset...")
        for item in data_generator:
            # 'fix_found' is a transient flag, not part of the dataset.
            item.pop("fix_found", None)
            fix_batch.append(item)
            fix_found_counter += 1

            if len(fix_batch) >= batch_size:
                flush_batch(fix_batch, first)
                first = False
                fix_batch = []

        if fix_batch:
            flush_batch(fix_batch, first)

        file.write("\n]")

    logger.info(f"Dataset successfully saved to {dataset_filepath}")
    logger.info(f"Number of commits with fix found saved: {fix_found_counter}")
|
|
||
|
|
||
def main():
    """Entry point: fetch the data sources, build lookups, and write the backout dataset."""
    download_databases()

    commit_dict, bug_to_commit_dict, bug_dict = preprocess_commits_and_bugs()

    datapoints = generate_datapoints(
        commit_limit=1000000,
        commit_dict=commit_dict,
        bug_to_commit_dict=bug_to_commit_dict,
        bug_dict=bug_dict,
        repo_dir="hg_dir",
    )

    save_datasets(
        directory_path="dataset",
        dataset_filename="backout_dataset.json",
        data_generator=datapoints,
        batch_size=1,
    )
|
|
||
|
|
||
# Run the full pipeline only when executed as a script.
if __name__ == "__main__":
    main()
Uh oh!
There was an error while loading. Please reload this page.