Skip to content

Commit 21550f8

Browse files
authored
Merge pull request #418 from SoftwareUnderstanding/dev
Adding data flow PRs
2 parents 9da8fe5 + 21da57d commit 21550f8

13 files changed

Lines changed: 671 additions & 272 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ build/*
22
dist/*
33
env/*
44
.vscode/
5+
.idea/
56
.ipynb_checkpoints/
67
config.json
78
__pycache__/

README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ Inspect4py currently works **only for Python 3 projects**.
2727

2828
## Background:
2929

30+
`inspect4py` can also capture [Data Flow Graphs](http://bears.ece.ucsb.edu/research-info/DP/dfg.html) for each function, a feature inspired by GraphCodeBERT: [GitHub](https://github.com/microsoft/CodeBERT) & [Paper](https://arxiv.org/abs/2009.08366). An illustration is given below:
31+
|Source Code|List Output|Networkx Image|
32+
|:-:|:-:|:-:|
33+
|<pre>def max(a, b):<br>x = 0<br> if a > b:<br> x = a<br>else:<br> x = b<br> return x</pre>|<pre>('a', 3, 'comesFrom', [], [])<br>('b', 5, 'comesFrom', [], [])<br>('x', 8, 'computedFrom', ['0'], [10])<br>('0', 10, 'comesFrom', [], [])<br>('a', 12, 'comesFrom', ['a'], [3])<br>('b', 14, 'comesFrom', ['b'], [5])<br>('x', 16, 'computedFrom', ['a'], [18])<br>('a', 18, 'comesFrom', ['a'], [3])<br>('x', 21, 'computedFrom', ['b'], [23])<br>('b', 23, 'comesFrom', ['b'], [5])<br>('x', 25, 'comesFrom', ['x'], [16, 21])</pre>|![image](docs/images/data_flow.png)|
34+
3035
`inspect4py` uses [ASTs](https://en.wikipedia.org/wiki/Abstract_syntax_tree), more specifically
3136
the [ast](https://docs.python.org/3/library/ast.html) module in Python, generating
3237
a tree of objects (per file) whose classes all inherit from [ast.AST](https://docs.python.org/3/library/ast.html#ast.AST).
@@ -60,6 +65,12 @@ Please cite our MSR 2022 demo paper:
6065

6166
### Preliminaries
6267

68+
Make sure you have tree-sitter installed (a C compiler is needed; see more [info](https://github.com/tree-sitter/tree-sitter)):
69+
70+
```
71+
pip install tree-sitter
72+
```
73+
6374
Make sure you have graphviz installed:
6475

6576
```
@@ -71,7 +82,7 @@ We have tested `inspect4py` in Python 3.7+. **Our recommended version is Python
7182

7283

7384
### Operating System
74-
We have tested `inspect4py` in Unix and MacOs.
85+
We have tested `inspect4py` on Unix, macOS and Windows 11 (22621.1265).
7586

7687
### Installation from pypi
7788
`inspect4py` is [available in pypi!](https://pypi.org/project/inspect4py/) Just install it like a regular package:
@@ -106,6 +117,9 @@ pigar
106117
setuptools==54.2.0
107118
json2html
108119
configparser
120+
bigcode_astgen
121+
GitPython
122+
tree-sitter
109123
```
110124

111125
If you want to run the evaluations, do not forget to add `pandas` to the previous set.
@@ -218,6 +232,8 @@ Options:
218232
-rm, --readme extract all readme files in the target repository.
219233
-md, --metadata extract metadata of the target repository using
220234
Github API.
235+
-df, --data_flow extract data flow graph for every function, BOOL
236+
-st, --symbol_table symbol table file location. STR
221237
--help Show this message and exit.
222238
```
223239

docs/images/data_flow.png

18.7 KB
Loading

inspect4py/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.0.6'
1+
__version__ = '0.0.7'

inspect4py/cli.py

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
1+
import ast
12
import json
23
import tokenize
34
import types
45
import builtins
56
import click
67
from docstring_parser import parse as doc_parse
8+
from tree_sitter import Language, Parser
79

810
from inspect4py import __version__
911
from inspect4py.staticfg import builder
1012
from inspect4py.utils import *
13+
# from utils import *
1114

1215
"""
1316
Code Inspector
@@ -26,7 +29,7 @@
2629

2730

2831
class CodeInspection:
29-
def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abstract_syntax_tree, source_code):
32+
def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abstract_syntax_tree, source_code, data_flow, parser):
3033
""" init method initializes the Code_Inspection object
3134
:param self self: represent the instance of the class
3235
:param str path: the file to inspect
@@ -41,6 +44,8 @@ def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abs
4144
self.out_json_path = out_json_path
4245
self.abstract_syntax_tree = abstract_syntax_tree
4346
self.source_code = source_code
47+
self.data_flow = data_flow
48+
self.parser = parser
4449
self.tree = self.parser_file()
4550
if self.tree != "AST_ERROR":
4651
self.nodes = self.walk()
@@ -51,13 +56,14 @@ def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abs
5156
self.bodyInfo = self.inspect_body()
5257
if control_flow:
5358
self.out_control_flow_path = out_control_flow_path
54-
self.controlFlowInfo = self.inspect_controlflow()
59+
self.controlFlowInfo = self.inspect_controlflow("png")
5560
else:
5661
self.controlFlowInfo = {}
5762
self.fileJson = self.file_json()
5863
else:
5964
self.fileJson = {}
6065

66+
6167
def find_classDef(self):
6268
classDef_nodes = [node for node in self.nodes if isinstance(node, ast.ClassDef)]
6369
class_init=[]
@@ -466,6 +472,13 @@ def file_json(self):
466472
json.dump(prune_json(file_dict), outfile)
467473
return [file_dict, json_file]
468474

475+
# def get_parser_data_flow(self):
476+
# parser = Parser()
477+
# LANGUAGE = Language(self.symbol_table, "python")
478+
# parser.set_language(LANGUAGE)
479+
# parser = [parser, DFG_python]
480+
# return parser
481+
469482
def _f_definitions(self, functions_definitions):
470483
"""_f_definitions extracts the name, args, docstring
471484
returns, raises of a list of functions or a methods.
@@ -477,11 +490,15 @@ def _f_definitions(self, functions_definitions):
477490
:param list functions_definitions: represent a list with all functions or methods nodes
478491
:return dictionary: a dictionary with the all the information at function/method level
479492
"""
480-
493+
# print(functions_definitions)
481494
funcs_info = {}
482495
for f in functions_definitions:
496+
# for node in ast.walk(f):
497+
# print(node.name)
498+
483499
funcs_info[f.name] = {}
484500
ds_f = ast.get_docstring(f)
501+
# print(ds_f)
485502
try:
486503
docstring = doc_parse(ds_f)
487504
funcs_info[f.name]["doc"] = {}
@@ -577,7 +594,10 @@ def _f_definitions(self, functions_definitions):
577594
funcs_info[f.name]["ast"] = ast_to_json(f)
578595
if self.source_code:
579596
funcs_info[f.name]["source_code"] = ast_to_source_code(f)
580-
597+
if self.data_flow:
598+
code_tokens, dfg = extract_dataflow(funcs_info[f.name]["source_code"], self.parser, "python")
599+
funcs_info[f.name]["data_flow"] = dfg
600+
funcs_info[f.name]["code_tokens"] = code_tokens
581601
return funcs_info
582602

583603
def _skip_dynamic_calls(self, funcs_info, classes_info, check_name, name, var_name):
@@ -1204,6 +1224,7 @@ def create_output_dirs(output_dir, control_flow):
12041224
@click.option('-i', '--input_path', type=str, required=True, help="input path of the file or directory to inspect.")
12051225
@click.option('-o', '--output_dir', type=str, default="output_dir",
12061226
help="output directory path to store results. If the directory does not exist, the tool will create it.")
1227+
@click.option('-st','--symbol_table', type=str, default="my_language.so", help="symbol table for the target function")
12071228
@click.option('-ignore_dir', '--ignore_dir_pattern', multiple=True, default=[".", "__pycache__"],
12081229
help="ignore directories starting with a certain pattern. This parameter can be provided multiple times "
12091230
"to ignore multiple directory patterns.")
@@ -1231,16 +1252,35 @@ def create_output_dirs(output_dir, control_flow):
12311252
help="extract all readme files in the target repository.")
12321253
@click.option('-md', '--metadata', type=bool, is_flag=True,
12331254
help="extract metadata of the target repository using Github API. (requires repository to have the .git folder)")
1255+
@click.option('-df', '--data_flow', type=bool, is_flag=True,
1256+
help="extract data flow graph of every function in the target repository")
1257+
12341258
def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requirements, html_output, call_list,
12351259
control_flow, directory_tree, software_invocation, abstract_syntax_tree, source_code, license_detection, readme,
1236-
metadata):
1260+
metadata, data_flow, symbol_table):
1261+
if data_flow:
1262+
if symbol_table == "my_language.so": # default option
1263+
path_to_languages = str(Path(__file__).parent / "resources")
1264+
if sys.platform.startswith("win") or sys.platform.startswith("cygwin"):
1265+
language = Language(path_to_languages + os.path.sep + "python_win.so", "python")
1266+
else:
1267+
language = Language(path_to_languages + os.path.sep + "python_unix.so", "python")
1268+
else:
1269+
language = Language(symbol_table, "python")
1270+
parser = Parser()
1271+
parser.set_language(language)
1272+
parser = [parser, DFG_python]
1273+
else:
1274+
parser = []
1275+
1276+
# print(parsers)
12371277
if (not os.path.isfile(input_path)) and (not os.path.isdir(input_path)):
12381278
print('The file or directory specified does not exist')
12391279
sys.exit()
12401280

12411281
if os.path.isfile(input_path):
12421282
cf_dir, json_dir = create_output_dirs(output_dir, control_flow)
1243-
code_info = CodeInspection(input_path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code)
1283+
code_info = CodeInspection(input_path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code, data_flow, parser)
12441284

12451285
# Generate the call list of a file
12461286
call_list_data = call_list_file(code_info)
@@ -1279,18 +1319,20 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir
12791319
for f in files:
12801320
if ".py" in f and not f.endswith(".pyc"):
12811321
try:
1322+
12821323
path = os.path.join(subdir, f)
12831324
relative_path = Path(subdir).relative_to(Path(input_path).parent)
12841325
out_dir = str(Path(output_dir) / relative_path)
12851326
cf_dir, json_dir = create_output_dirs(out_dir, control_flow)
1286-
code_info = CodeInspection(path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code)
1327+
code_info = CodeInspection(path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code, data_flow, parser)
1328+
# print(parsers)
12871329
if code_info.fileJson:
12881330
if out_dir not in dir_info:
12891331
dir_info[out_dir] = [code_info.fileJson[0]]
12901332
else:
12911333
dir_info[out_dir].append(code_info.fileJson[0])
12921334
except:
1293-
print("Error when processing " + f + ": ", sys.exc_info()[0])
1335+
print("Error when processing " + f + ": ", sys.exc_info())
12941336
continue
12951337

12961338
# Generate the call list of the Dir
@@ -1332,7 +1374,7 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir
13321374
dir_info["software_type"] = "not found"
13331375
if license_detection:
13341376
try:
1335-
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "licenses")
1377+
licenses_path = str(Path(__file__).parent / "licenses")
13361378
license_text = extract_license(input_path)
13371379
rank_list = detect_license(license_text, licenses_path)
13381380
dir_info["license"] = {}

inspect4py/parse_setup_files.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ def parse_setup_py(parent_dir):
5656
if single_line:
5757
elem = setup_content[console_index]
5858
cs = elem.split("=")
59-
cs_string = cs[0].strip().replace('\'', '').split('["')[1]
59+
# print(cs)
60+
# print(cs[1].strip())
61+
cs_string = cs[1].strip().replace('\'', '').split('["')[1]
6062
cs_list.append(normalize(cs_string))
6163
setup_info["installation"] = "pip install " + cs_string
6264
setup_info["run"].append(cs_string)
480 KB
Binary file not shown.

inspect4py/resources/python_win.so

399 KB
Binary file not shown.

0 commit comments

Comments
 (0)