Skip to content

Commit 53086b6

Browse files
Merge pull request #396 from WilliamsCJ/license-text-extraction
Extend license detection to include extracted license text
2 parents 392d515 + 0b03c66 commit 53086b6

4 files changed

Lines changed: 80 additions & 26 deletions

File tree

inspect4py/cli.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,10 +1334,13 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir
13341334
if license_detection:
13351335
try:
13361336
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "licenses")
1337-
rank_list = detect_license(input_path, licenses_path)
1338-
dir_info["detected_license"] = [{k: f"{v:.1%}"} for k, v in rank_list]
1339-
except:
1340-
pass
1337+
license_text = extract_license(input_path)
1338+
rank_list = detect_license(license_text, licenses_path)
1339+
dir_info["license"] = {}
1340+
dir_info["license"]["detected_type"] = [{k: f"{v:.1%}"} for k, v in rank_list]
1341+
dir_info["license"]["extracted_text"] = license_text
1342+
except Exception as e:
1343+
print("Error when detecting license: %s", str(e))
13411344
if readme:
13421345
dir_info["readme_files"] = extract_readme(input_path)
13431346
if metadata:

inspect4py/utils.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -653,13 +653,16 @@ def dice_coefficient(a, b):
653653
return dice_coeff
654654

655655

656-
def detect_license(input_path, licenses_path, threshold=0.9):
657-
"""
658-
Function to detect the license of a file.
659-
:param input_path: Path of the repository to be analyzed.
660-
:param licenses_path: Path to the folder containing license templates.
661-
:param threshold: Threshold to consider a license as detected,
662-
a float number between 0 and 1.
656+
def extract_license(input_path):
657+
"""Extracts the license of the repository.
658+
Args:
659+
input_path (str): Path of the repository to be analyzed.
660+
661+
Returns:
662+
str: The text of the first matching license file found in input_path.
663+
664+
Raises:
665+
Exception: If a license file is not found.
663666
"""
664667
license_filenames = [
665668
"LICENSE",
@@ -671,21 +674,39 @@ def detect_license(input_path, licenses_path, threshold=0.9):
671674
"COPYING.md",
672675
"COPYING.rst",
673676
]
677+
674678
license_file = None
675679
for filename in os.listdir(input_path):
676680
if filename in license_filenames:
677681
license_file = os.path.join(input_path, filename)
678682
break
683+
679684
if license_file is None:
680-
return "No license file detected"
685+
raise Exception("License file not found.")
681686

682687
with open(license_file, "r") as f:
683688
license_text = f.read()
684689

690+
return license_text
691+
692+
693+
def detect_license(license_text, licenses_path, threshold=0.9):
694+
"""
695+
Function to detect the license type from extracted text.
696+
697+
Args:
698+
license_text (str): The extracted license text.
699+
licenses_path (str): Path of the folder containing license templates.
700+
threshold (float): Threshold to consider a license as detected. A float between 0 and 1.
701+
702+
Returns:
703+
List of (SPDX id, similarity score) tuples for the templates scoring above threshold, sorted from best to worst match; empty if none qualify.
704+
"""
685705
# Regex pattern for preprocessing license templates and extract spdx id
686706
pattern = re.compile(
687707
"(---\n.*(spdx-id: )(?P<id>.+?)\n.*---\n)(?P<template>.*)", re.DOTALL
688708
)
709+
689710
rank_list = []
690711
for licen in os.listdir(licenses_path):
691712
with open(os.path.join(licenses_path, licen), "r") as f:
@@ -699,11 +720,7 @@ def detect_license(input_path, licenses_path, threshold=0.9):
699720
if dice_coeff > threshold:
700721
rank_list.append((spdx_id, dice_coeff))
701722

702-
if rank_list:
703-
return sorted(rank_list, key=lambda t: t[1], reverse=True)
704-
705-
return "License not recognised"
706-
723+
return sorted(rank_list, key=lambda t: t[1], reverse=True)
707724

708725
def extract_readme(input_path: str) -> dict:
709726
"""
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
A random license.

test/test_inspect4py.py

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -535,10 +535,11 @@ def test_source_code_body(self):
535535
actual_code = code_info.fileJson[0]["body"]["source_code"]
536536
assert expected_code == actual_code
537537

538+
538539
def test_license_detection(self):
539540
input_paths = ["./test_files/Chowlk", "./test_files/pylops", "./test_files/somef"]
540541
output_dir = "./output_dir"
541-
542+
fig = False
542543
ignore_dir_pattern = [".", "__pycache__"]
543544
ignore_file_pattern = [".", "__pycache__"]
544545
requirements = False
@@ -555,14 +556,42 @@ def test_license_detection(self):
555556
expected_liceses = ['Apache-2.0', 'LGPL-3.0', 'MIT']
556557
first_rank_licenses = []
557558
for input_path in input_paths:
558-
dir_info = invoke_inspector(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requirements,
559-
call_list, control_flow, directory_tree, software_invocation, abstract_syntax_tree,
560-
source_code, license_detection, readme, metadata)
561-
first_rank_licenses.append(next(iter(dir_info["detected_license"][0])))
559+
dir_info = invoke_inspector(input_path, output_dir, ignore_dir_pattern,
560+
ignore_file_pattern, requirements,
561+
call_list, control_flow, directory_tree,
562+
software_invocation, abstract_syntax_tree,
563+
source_code, license_detection, readme, metadata)
564+
first_rank_licenses.append(next(iter(dir_info["license"]["detected_type"][0])))
562565
shutil.rmtree(output_dir)
563-
566+
564567
assert first_rank_licenses == expected_liceses
565568

569+
def test_license_text_extraction(self):
570+
license_text = "A random license."
571+
input_path = "./test_files/test_license_extraction"
572+
output_dir = "./output_dir"
573+
fig = False
574+
ignore_dir_pattern = [".", "__pycache__"]
575+
ignore_file_pattern = [".", "__pycache__"]
576+
requirements = False
577+
call_list = False
578+
control_flow = False
579+
directory_tree = False
580+
software_invocation = False
581+
abstract_syntax_tree = False
582+
source_code = False
583+
license_detection = True
584+
readme = False
585+
metadata = False
586+
587+
dir_info = invoke_inspector(input_path, output_dir, ignore_dir_pattern,
588+
ignore_file_pattern, requirements,
589+
call_list, control_flow, directory_tree, software_invocation,
590+
abstract_syntax_tree,
591+
source_code, license_detection, readme, metadata)
592+
593+
assert dir_info["license"]["extracted_text"] == license_text
594+
566595

567596
def test_readme(self):
568597
input_path = "./test_files/test_readme"
@@ -701,9 +730,13 @@ def invoke_inspector(input_path, output_dir, ignore_dir_pattern, ignore_file_pat
701730
# Extract the first for software type.
702731
dir_info["software_type"] = rank_software_invocation(soft_invocation_info_list)
703732
if license_detection:
704-
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../inspect4py/licenses")
705-
rank_list = detect_license(input_path, licenses_path)
706-
dir_info["detected_license"] = [{k: f"{v:.1%}"} for k, v in rank_list]
733+
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
734+
"../inspect4py/licenses")
735+
license_text = extract_license(input_path)
736+
rank_list = detect_license(license_text, licenses_path)
737+
dir_info["license"] = {}
738+
dir_info["license"]["detected_type"] = [{k: f"{v:.1%}"} for k, v in rank_list]
739+
dir_info["license"]["extracted_text"] = license_text
707740
if readme:
708741
dir_info["readme_files"] = extract_readme(input_path)
709742
if metadata:

0 commit comments

Comments
 (0)