Skip to content
Draft
Changes from 15 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
eabf9bb
Created base script to construct dataset for backout commits
benjaminmah May 2, 2024
aaf8386
Created new directory to store dataset, added comments to script
benjaminmah May 2, 2024
c096468
Cleaned up code, restructured dataset to include the inducing, backou…
benjaminmah May 3, 2024
24046fd
Sample dataset (count_limit = 500)
benjaminmah May 3, 2024
3eb6605
Removed old datasets
benjaminmah May 3, 2024
2db5029
Skip 'fixing commits' that are actually backout commits
benjaminmah May 3, 2024
3516c09
Sample dataset (num_count = 500)
benjaminmah May 3, 2024
0544b27
Deleted dataset
benjaminmah May 6, 2024
49570ac
Added cache for processed dictionaries, removed unused fields, simpli…
benjaminmah May 6, 2024
fc37940
Split up function `filter_commits` to handle saving to directory and …
benjaminmah May 6, 2024
10314dd
Replaced list with generator, stylized code to match standard coding …
benjaminmah May 6, 2024
943eb40
Removed commented out code
benjaminmah May 6, 2024
8ed0784
Added new file to log commits that do not have a fix commit, used `bu…
benjaminmah May 7, 2024
39ab450
Added metric collection for number of fixes found, number of no fixes…
benjaminmah May 8, 2024
fe8114b
Added condition to only append to dataset if the number of non backed…
benjaminmah May 8, 2024
74939f2
Added the diff between the original commit and the fixing commit in t…
benjaminmah May 10, 2024
be10d51
Removed separating by `added_lines` and `removed_lines`, storing raw …
benjaminmah May 10, 2024
3a406ef
Added threshold for number of changes and separated diffs by file.
benjaminmah May 13, 2024
bc23a22
Added support for hglib grafting from `repository.py`
benjaminmah May 14, 2024
6058305
Added grafting support to apply original commit to parent commit of t…
benjaminmah May 14, 2024
e666c2e
Cleaned up code
benjaminmah May 15, 2024
40bbe1b
Removed storing bugs without fixes, limited bugs to be within the las…
benjaminmah May 15, 2024
a4c5bff
Reverted to storing the raw diff as a utf-8 encoded string.
benjaminmah May 15, 2024
f133041
Removed unnecessary fields when populating dataset, extract correct d…
benjaminmah May 21, 2024
d202b0b
Fixed type hinting
benjaminmah May 22, 2024
79152a3
Added `hg merge-tool` for automatically resolving conflicts when graf…
benjaminmah May 22, 2024
4740196
Fixed docstring for function `graft`
benjaminmah May 22, 2024
38d6cf8
Added check to omit any diff containing conflicts
benjaminmah May 23, 2024
9fc018c
Made code more Pythonic
benjaminmah May 27, 2024
846210f
Changed standard collections to generic types
benjaminmah Jun 3, 2024
ae28dcf
Implemented logging error when shelving changes
benjaminmah Jun 3, 2024
c6f6a8f
Implemented logging error when grafting
benjaminmah Jun 3, 2024
37c51b6
Renamed `bug_dict` and `bug_info` to `bug_resolution_map` and `bug_re…
benjaminmah Jun 3, 2024
fad6df6
Removed `commit_dict`
benjaminmah Jun 3, 2024
fb7a17d
Changed `logger.info` to `logger.warning` when error encountered whil…
benjaminmah Jun 4, 2024
bfc77e4
Reverted importing standard collections
benjaminmah Jun 4, 2024
66108ad
Added raise-from when shelving
benjaminmah Jun 4, 2024
0d83fa7
Removed try-except when grafting
benjaminmah Jun 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 250 additions & 0 deletions scripts/backout_data_collection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
import json
import logging
import os
from typing import Any, Dict, Generator, Tuple

from tqdm import tqdm

from bugbug import bugzilla, db, repository

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_databases() -> None:
    """Download the Bugzilla bugs DB and the Mercurial commits DB.

    Raises:
        RuntimeError: if either database fails to download.
    """
    logger.info("Downloading bugs database...")
    # Raise explicitly instead of `assert`: asserts are stripped under `-O`,
    # and a missing database must always abort the run.
    if not db.download(bugzilla.BUGS_DB):
        raise RuntimeError("Failed to download the bugs database")

    logger.info("Downloading commits database...")
    if not db.download(repository.COMMITS_DB, support_files_too=True):
        raise RuntimeError("Failed to download the commits database")


def preprocess_commits_and_bugs() -> Tuple[Dict, Dict, Dict]:
    """Build in-memory lookup tables from the commits and bugs databases.

    Returns:
        A tuple of three dicts:
        - commit_dict: commit node hash -> reduced commit record (node,
          bug_id, desc, pushdate, backedoutby, backsout).
        - bug_to_commit_dict: bug ID -> list of those commit records, in the
          order `repository.get_commits` yields them.
        - bug_dict: bug ID -> resolution string (e.g. "FIXED").
    """
    logger.info("Preprocessing commits and bugs...")
    commit_dict: Dict[str, Dict[str, Any]] = {}
    bug_to_commit_dict: Dict[int, list] = {}

    logger.info("Preprocessing commits...")
    # Store commits with their hashes and bug IDs as keys.
    for commit in tqdm(
        repository.get_commits(
            include_no_bug=True, include_backouts=True, include_ignored=True
        ),
        desc="Preprocessing commits",
    ):
        # Keep only the fields the rest of the pipeline reads, to bound memory.
        commit_data = {
            "node": commit["node"],
            "bug_id": commit["bug_id"],
            "desc": commit["desc"],
            "pushdate": commit["pushdate"],
            "backedoutby": commit["backedoutby"],
            "backsout": commit["backsout"],
        }
        commit_dict[commit["node"]] = commit_data
        # setdefault replaces the previous check-then-append double lookup.
        bug_to_commit_dict.setdefault(commit["bug_id"], []).append(commit_data)

    logger.info("Preprocessing bugs...")
    bug_dict: Dict[int, str] = {}

    # Store bugs with their bug IDs as keys. Use bug["id"] rather than
    # bug.get("id") so a malformed record fails loudly instead of silently
    # mapping None -> resolution.
    for bug in tqdm(bugzilla.get_bugs(include_invalid=True), desc="Preprocessing bugs"):
        bug_dict[bug["id"]] = bug["resolution"]

    return commit_dict, bug_to_commit_dict, bug_dict


def filter_commits(
    commit_limit: int,
    commit_dict: dict,
    bug_to_commit_dict: dict,
    bug_dict: dict,
) -> Generator[Dict[str, Any], None, None]:
    """Yield dataset entries for commits that were backed out on FIXED bugs.

    For each backed-out commit whose bug resolved as FIXED, yields a dict
    with the inducing commit, its backout commit, a `fix_found` flag, and —
    when a fixing commit was found — the fixing commit itself.

    Args:
        commit_limit: maximum number of commits to scan (capped at the
            database size).
        commit_dict: commit node -> reduced commit record.
        bug_to_commit_dict: bug ID -> ordered list of commit records.
        bug_dict: bug ID -> resolution string.
    """
    # Approximate total number of commits in the commits DB when this script
    # was written; caps the limit so it never exceeds the dataset size.
    max_commits = 709458
    commit_limit = min(commit_limit, max_commits)
    counter = 0

    logger.info("Filtering commits...")

    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        bug_resolution = bug_dict.get(commit["bug_id"])

        counter += 1

        # Only consider commits that were backed out and whose bug is FIXED.
        if commit["backedoutby"] and bug_resolution == "FIXED":
            fixing_commit, non_backed_out_commits = find_next_commit(
                commit["bug_id"],
                bug_to_commit_dict,
                commit["node"],
                commit["backedoutby"],
            )

            backout = commit_dict[commit["backedoutby"]]

            # Common payload: hashes of the bug-inducing and backout commits,
            # with push date and description for further context.
            entry: Dict[str, Any] = {
                "non_backed_out_commits": non_backed_out_commits,
                "fix_found": bool(fixing_commit),
                "bug_id": commit["bug_id"],
                "inducing_commit": {
                    "node": commit["node"],
                    "pushdate": commit["pushdate"],
                    "desc": commit["desc"],
                },
                "backout_commit": {
                    "node": commit["backedoutby"],
                    "pushdate": backout["pushdate"],
                    "desc": backout["desc"],
                },
            }

            # If no fixing commit was found, the entry is still yielded
            # (fix_found=False) so the caller can log it separately.
            if fixing_commit:
                entry["fixing_commit"] = {
                    "node": fixing_commit["node"],
                    "pushdate": fixing_commit["pushdate"],
                    "desc": fixing_commit["desc"],
                }

            yield entry

        if counter >= commit_limit:
            break


def find_next_commit(
    bug_id: int, bug_to_commit_dict: dict, inducing_node: str, backout_node: str
) -> Tuple[Dict, int]:
    """Locate the fixing commit that follows a backout on a bug.

    Walks the bug's commits in order. Once the backout commit has been
    seen, the first later commit that is neither backed out nor itself a
    backout is taken as the fixing commit. Every later commit that is not
    backed out (fixing or not) is counted.

    Returns:
        (fixing commit record, count of non-backed-out commits after the
        backout). The record is ``{}`` when no valid fixing commit exists,
        or when the candidate is the inducing or backout commit itself.
    """
    seen_backout = False
    fix = None
    survivors = 0

    for candidate in bug_to_commit_dict[bug_id]:
        # Only commits strictly after the backout are considered; the
        # backout itself flips the flag at the end of its iteration.
        if seen_backout and not candidate["backedoutby"]:
            survivors += 1
            if fix is None and not candidate["backsout"]:
                fix = candidate

        if candidate["node"] == backout_node:
            seen_backout = True

    if fix is None or fix["node"] in (inducing_node, backout_node):
        return {}, survivors

    return fix, survivors


def _append_json_array_item(file, item: dict, is_first: bool) -> bool:
    """Write `item` as one element of a streamed JSON array, adding the comma
    separator before every element except the first. Returns the new value of
    the caller's "first element" flag (always False after a write)."""
    if not is_first:
        file.write(",\n")
    file.write(json.dumps(item, indent=4))
    return False


def save_datasets(
    directory_path: str,
    dataset_filename: str,
    no_fix_commit_filename: str,
    data_generator,
) -> None:
    """Stream filtered commits into two JSON-array files.

    Entries with a fixing commit (and at most 2 non-backed-out follow-up
    commits) go to `dataset_filename`; entries with no fixing commit go to
    `no_fix_commit_filename`. Entries with a fix but more than 2
    non-backed-out follow-ups are dropped (too ambiguous).

    Args:
        directory_path: output directory, created if missing.
        dataset_filename: file for commits with a fix found.
        no_fix_commit_filename: file for commits without a fix.
        data_generator: iterable of entry dicts from `filter_commits`.
    """
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
        logger.info(f"Directory {directory_path} created")

    dataset_filepath = os.path.join(directory_path, dataset_filename)
    no_fix_commit_filepath = os.path.join(directory_path, no_fix_commit_filename)

    fix_found_counter = 0
    no_fix_found_counter = 0
    backed_out_counter = 0

    # The arrays are written incrementally so the generator is never
    # materialized in memory.
    with open(dataset_filepath, "w") as dataset_file, open(
        no_fix_commit_filepath, "w"
    ) as no_fix_file:
        dataset_file.write("[\n")
        dataset_first = True

        no_fix_file.write("[\n")
        no_fix_first = True

        for item in data_generator:
            if item["non_backed_out_commits"] > 1:
                backed_out_counter += 1

            if item["fix_found"] and item["non_backed_out_commits"] <= 2:
                item.pop("fix_found", None)
                dataset_first = _append_json_array_item(
                    dataset_file, item, dataset_first
                )
                fix_found_counter += 1
            elif not item["fix_found"]:
                item.pop("fix_found", None)
                no_fix_first = _append_json_array_item(no_fix_file, item, no_fix_first)
                no_fix_found_counter += 1

        dataset_file.write("\n]")
        no_fix_file.write("\n]")

    logger.info(f"Dataset successfully saved to {dataset_filepath}")
    logger.info(f"Commits without a fix successfully saved to {no_fix_commit_filepath}")

    logger.info(f"Number of commits with fix found saved: {fix_found_counter}")
    logger.info(f"Number of commits with no fix found saved: {no_fix_found_counter}")
    logger.info(
        f"Number of commits with multiple non backed out commits following it: {backed_out_counter}"
    )


def main():
    """Entry point: download the databases, build the lookup tables, filter
    backed-out commits, and write the two dataset files."""
    download_databases()

    commit_dict, bug_to_commit_dict, bug_dict = preprocess_commits_and_bugs()

    save_datasets(
        directory_path="dataset",
        dataset_filename="backout_dataset.json",
        no_fix_commit_filename="no_fix_dataset.json",
        data_generator=filter_commits(
            commit_limit=1000000,
            commit_dict=commit_dict,
            bug_to_commit_dict=bug_to_commit_dict,
            bug_dict=bug_dict,
        ),
    )


if __name__ == "__main__":
    main()