From 910c30853522e96382206bb33889bf824f7b1a52 Mon Sep 17 00:00:00 2001
From: Ville Laitila <ville.laitila@softagram.com>
Date: Tue, 16 Jun 2026 23:35:37 +0300
Subject: [PATCH] feat(cli): add model comparison CLI and correct comparison
 docs

Add a shell entry point for comparing two sgraph models:

    python -m sgraph.cli.compare MODEL_A MODEL_B [options]

Modeled on the existing sgraph.cypher CLI (argparse, status to stderr).
Options: -f/--format {text,json}, -o/--output FILE, --rename-detection,
--exclude-attrs a,b,c. Output is either the human-readable
printCompareInfos() summary (text) or pretty-printed JSON listing added,
removed and changed elements and dependencies.

Exit codes follow git diff conventions so the command is usable as a
change gate in scripts/CI:
    0  no differences
    1  differences found
    2  error (bad path, parse failure, usage error)

Also correct the comparison documentation, which described an API that
does not exist (compare() returning a dict, a calculateSimilarity()
method, and an Example 9 that called .get() on the result). The docs now
match the real ModelCompare API: compare()/compareModels() return an
SGraph, and getCompareInfos()/printCompareInfos() extract structured
results.

Implementation is split into a testable run(argv) -> int core and a thin
main(); 7 tests cover both output formats, the three exit codes,
--exclude-attrs and --rename-detection.
---
 README.md                     |  22 +++++
 docs/api-reference.md         |  91 +++++++++++++++++++--
 docs/examples.md              |  69 +++++++++-------
 src/sgraph/cli/compare.py     | 146 ++++++++++++++++++++++++++++++++++
 tests/cli/test_compare_cli.py | 116 +++++++++++++++++++++++++++
 5 files changed, 408 insertions(+), 36 deletions(-)
 create mode 100644 src/sgraph/cli/compare.py
 create mode 100644 tests/cli/test_compare_cli.py

diff --git a/README.md b/README.md
index dedbe84..5fad2c1 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,28 @@ python -m sgraph.cypher model.xml.zip -f dot 'MATCH (a)-[r]->(b) RETURN a, r, b'
 
 See the [Cypher documentation](https://softagram.github.io/sgraph/cypher.html) for full details and query examples.
 
+### Comparing models
+
+Two models can be compared to see what was added, removed, or changed:
+
+```python
+from sgraph.compare.modelcompare import ModelCompare
+
+mc = ModelCompare()
+compare_model = mc.compare('old_model.xml', 'new_model.xml')  # returns an SGraph
+mc.printCompareInfos(compare_model)
+```
+
+A CLI is also available (exit codes follow `git diff --exit-code`: `0` = no differences, `1` = differences, `2` = error):
+
+```bash
+python -m sgraph.cli.compare old_model.xml new_model.xml            # human-readable summary
+python -m sgraph.cli.compare old_model.xml new_model.xml -f json    # machine-readable JSON
+python -m sgraph.cli.compare old_model.xml new_model.xml --rename-detection
+```
+
+See the [API reference](https://softagram.github.io/sgraph/api-reference.html#comparison) for the full comparison API.
+
 ## Current utilization
 [Softagram](https://github.com/softagram) uses it for building up the information model about the 
 analyzed software.
diff --git a/docs/api-reference.md b/docs/api-reference.md
index 759e2b5..6691eef 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -380,16 +380,59 @@ filtering = SGraphFiltering(model)
 from sgraph.compare.modelcompare import ModelCompare
 
 comparer = ModelCompare()
-result = comparer.compare('old_model.xml', 'new_model.xml')
+# Returns a *compare model* (an SGraph), not a plain dict.
+compare_model = comparer.compare('old_model.xml', 'new_model.xml')
 ```
 
-#### Methods
+`compare()` / `compareModels()` return a new `SGraph` (the "compare model") in
+which differences are annotated as element/association attributes (`compare`,
+`_only_in`, `_changed_dep`, `_change_count`, `_attr_diff`, ...). Use the
+`getCompareInfos()` / `printCompareInfos()` helpers, or the individual
+extractors below, to turn that compare model into structured results.
+
+#### Building the compare model
+
+##### `compare(path1: str, path2: str, exclude_attrs: set[str] | None = None) -> SGraph`
+Loads two models from XML (or zipped XML) file paths and compares them.
+
+##### `compareModels(model1: SGraph, model2: SGraph, rename_detection: bool = False, exclude_attrs: set[str] | None = None) -> SGraph`
+Compares two already-loaded in-memory models.
+
+`exclude_attrs` is a set of attribute names to ignore during comparison. The
+preset `SLIDING_WINDOW_ATTRS` (from `sgraph.compare.compareutils`) suppresses
+time-windowed metric noise (author/commit/bug counts, `last_modified`, etc.):
+
+```python
+from sgraph.compare.compareutils import SLIDING_WINDOW_ATTRS
 
-##### `compare(old_model: str, new_model: str) -> Dict`
-Compares two models and returns differences.
+compare_model = comparer.compare('a.xml', 'b.xml', exclude_attrs=SLIDING_WINDOW_ATTRS)
+```
+
+#### Reading the results
+
+##### `getCompareInfos(compare_model: SGraph) -> tuple`
+Returns a 6-tuple:
+`(new_deps, removed_deps, changed_elems, new_elems, removed_elems, attr_changes)`.
+
+##### `printCompareInfos(compare_model: SGraph) -> tuple`
+Prints a human-readable summary and returns the same 6-tuple as `getCompareInfos()`.
 
-##### `calculateSimilarity(old_model: str, new_model: str) -> float`
-Calculates similarity score between models.
+The tuple elements are:
+
+| Field | Shape | Meaning |
+|-------|-------|---------|
+| `new_deps` | `list[(SElementAssociation, int)]` | Added dependencies with dependency length, longest first |
+| `removed_deps` | `list[(SElementAssociation, int)]` | Removed dependencies with dependency length, longest first |
+| `changed_elems` | `list[(SElement, int)]` | Elements with a change count, highest first |
+| `new_elems` | `list[(str, SElement)]` | Added elements as `("parent/name", element)` |
+| `removed_elems` | `list[(str, SElement)]` | Removed elements as `("parent/name", element)` |
+| `attr_changes` | `list[(SElement, str)]` | Elements whose attributes changed, with the diff string |
+
+Individual extractors are also available: `newAndRemovedElems()`,
+`newAndRemovedDependenciesLists()`, `elemsWithChanges()`,
+`elemsWithAttrChanges()`, `uniqueConnectionsCreated()`,
+`uniqueConnectionsRemoved()`, `externalChanges()`, and
+`getElementsWithAttrDiff(compare_model, attribute)`.
 
 ## CLI Tools
 
@@ -420,6 +463,42 @@ Options:
 - `--type TYPE` - Filter by element type
 - `--output FILE` - Output file path
 
+### compare
+
+Compare two models and report the differences (added/removed/changed elements
+and dependencies). `MODEL_A` is the "before"/old model, `MODEL_B` the
+"after"/new model.
+
+```bash
+# Human-readable summary
+python -m sgraph.cli.compare old_model.xml new_model.xml
+
+# Machine-readable, pretty-printed JSON
+python -m sgraph.cli.compare old_model.xml new_model.xml -f json
+```
+
+Options:
+- `-f, --format {text,json}` - Output format (default: `text`; `text` reuses `ModelCompare.printCompareInfos()`)
+- `-o, --output FILE` - Write output to a file instead of stdout
+- `--rename-detection` - Detect renamed elements (collapses an add+remove into a single changed element annotated with `old_name`)
+- `--exclude-attrs a,b,c` - Comma-separated attribute names to ignore during comparison
+
+Exit codes follow `git diff --exit-code` (i.e. `diff(1)`) conventions:
+
+| Code | Meaning |
+|------|---------|
+| `0` | Models are equivalent (no differences) |
+| `1` | Differences were found |
+| `2` | Error (bad path, parse failure, or usage error) |
+
+This makes it usable as a change gate in scripts/CI:
+
+```bash
+rc=0; python -m sgraph.cli.compare before.xml after.xml -f json -o diff.json || rc=$?
+if [ "$rc" -eq 1 ]; then echo "Model changed — see diff.json"; fi
+if [ "$rc" -ge 2 ]; then echo "Comparison failed" >&2; exit "$rc"; fi
+```
+
 ## Exceptions
 
 ### SElementMergedException
diff --git a/docs/examples.md b/docs/examples.md
index e57771f..54d39b2 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -463,47 +463,56 @@ def create_interactive_visualization(model_path):
 
 ```python
 from sgraph.compare.modelcompare import ModelCompare
-from sgraph.modelapi import ModelApi
 
 def compare_model_versions(old_model_path, new_model_path):
     """
-    Compare two versions of a model to track changes
+    Compare two versions of a model to track changes.
+
+    ModelCompare.compare() returns a *compare model* (an SGraph) with the
+    differences annotated as attributes. getCompareInfos() turns that compare
+    model into structured lists.
     """
     comparer = ModelCompare()
-    comparison = comparer.compare(old_model_path, new_model_path)
-    
+    compare_model = comparer.compare(old_model_path, new_model_path)
+
+    # getCompareInfos returns a 6-tuple. Note the order:
+    # (new_deps, removed_deps, changed_elems, new_elems, removed_elems, attr_changes)
+    (new_deps, removed_deps, changed_elems,
+     new_elems, removed_elems, attr_changes) = comparer.getCompareInfos(compare_model)
+
     print("=== Model Evolution Analysis ===")
     print(f"Old model: {old_model_path}")
     print(f"New model: {new_model_path}")
     print("=" * 40)
-    
-    # Analyze changes
-    added_elements = comparison.get('added_elements', [])
-    removed_elements = comparison.get('removed_elements', [])
-    modified_elements = comparison.get('modified_elements', [])
-    
-    print(f"📈 Added elements: {len(added_elements)}")
-    for elem in added_elements[:5]:
-        print(f"   + {elem}")
-    if len(added_elements) > 5:
-        print(f"   ... and {len(added_elements) - 5} more")
-    
-    print(f"\n📉 Removed elements: {len(removed_elements)}")
-    for elem in removed_elements[:5]:
-        print(f"   - {elem}")
-    if len(removed_elements) > 5:
-        print(f"   ... and {len(removed_elements) - 5} more")
-    
-    print(f"\n🔄 Modified elements: {len(modified_elements)}")
-    for elem in modified_elements[:5]:
-        print(f"   ~ {elem}")
-    if len(modified_elements) > 5:
-        print(f"   ... and {len(modified_elements) - 5} more")
-    
-    return comparison
+
+    # new_elems / removed_elems are lists of ("parent/name", SElement) tuples.
+    print(f"📈 Added elements: {len(new_elems)}")
+    for label, elem in new_elems[:5]:
+        print(f"   + {elem.getPath()}")
+    if len(new_elems) > 5:
+        print(f"   ... and {len(new_elems) - 5} more")
+
+    print(f"\n📉 Removed elements: {len(removed_elems)}")
+    for label, elem in removed_elems[:5]:
+        print(f"   - {elem.getPath()}")
+    if len(removed_elems) > 5:
+        print(f"   ... and {len(removed_elems) - 5} more")
+
+    # changed_elems is a list of (SElement, change_count) tuples.
+    print(f"\n🔄 Changed elements: {len(changed_elems)}")
+    for elem, change_count in changed_elems[:5]:
+        print(f"   ~ {elem.getPath()} ({change_count} changes)")
+    if len(changed_elems) > 5:
+        print(f"   ... and {len(changed_elems) - 5} more")
+
+    # new_deps / removed_deps are lists of (SElementAssociation, length) tuples.
+    print(f"\n🔗 Added dependencies: {len(new_deps)},"
+          f" removed: {len(removed_deps)}")
+
+    return compare_model
 
 # Usage
-# evolution = compare_model_versions('v1.0_model.xml', 'v2.0_model.xml')
+# compare_model = compare_model_versions('v1.0_model.xml', 'v2.0_model.xml')
 ```
 
 ### Example 10: Custom Metrics Calculation
diff --git a/src/sgraph/cli/compare.py b/src/sgraph/cli/compare.py
new file mode 100644
index 0000000..5249157
--- /dev/null
+++ b/src/sgraph/cli/compare.py
@@ -0,0 +1,146 @@
+"""
+Compare two sgraph models and report the differences.
+
+Usage:
+    python -m sgraph.cli.compare MODEL_A MODEL_B [options]
+
+    MODEL_A is the "before"/old model, MODEL_B the "after"/new model
+    (paths to .xml or .xml.zip files).
+
+Exit codes (git-diff style):
+    0  models are equivalent (no differences)
+    1  differences were found
+    2  an error occurred (bad path, parse failure, usage error)
+"""
+
+from __future__ import annotations
+
+import argparse
+import contextlib
+import json
+import sys
+
+from sgraph import SGraph
+from sgraph.compare.modelcompare import ModelCompare
+
+
+def _build_payload(model_a: str, model_b: str, infos) -> dict:
+    """Convert the getCompareInfos() 6-tuple into a JSON-serialisable dict.
+
+    The result carries both model paths, one list per change category and a
+    'summary' entry mapping each category to the length of its list.
+    """
+    added_deps, dropped_deps, changed, added, dropped, attr_diffs = infos
+
+    def describe_dependency(pair):
+        assoc, dep_length = pair
+        return {
+            'from': assoc.fromElement.getPath(),
+            'to': assoc.toElement.getPath(),
+            'deptype': assoc.deptype,
+            'length': dep_length,
+        }
+
+    def describe_change(pair):
+        element, count = pair
+        described = {'path': element.getPath(), 'change_count': int(count)}
+        # Expose the previous name only for elements flagged as renamed.
+        if element.attrs.get('renamed') == 'true' and 'old_name' in element.attrs:
+            described['old_name'] = element.attrs['old_name']
+        return described
+
+    def paths_of(labelled):
+        # Entries are (label, element) pairs; only the element path is kept.
+        return [{'path': element.getPath()} for _, element in labelled]
+
+    lists = {
+        'new_elements': paths_of(added),
+        'removed_elements': paths_of(dropped),
+        'changed_elements': list(map(describe_change, changed)),
+        'new_dependencies': list(map(describe_dependency, added_deps)),
+        'removed_dependencies': list(map(describe_dependency, dropped_deps)),
+        # Entries with an empty diff carry no information (e.g. the only
+        # differing attribute was excluded via --exclude-attrs), so drop them.
+        'attr_changes': [{'path': element.getPath(), 'diff': diff}
+                         for element, diff in attr_diffs if diff],
+    }
+    # The summary lists the categories in the same order as the lists above.
+    summary = {category: len(entries) for category, entries in lists.items()}
+    return {'model_a': model_a, 'model_b': model_b, **lists, 'summary': summary}
+
+
+def _write_output(payload: dict, compare_model: SGraph, mc: ModelCompare,
+                  output_format: str, output: str | None):
+    """Write the JSON payload or the text summary to ``output`` (or stdout)."""
+    # Explicit UTF-8 so non-ASCII element paths do not depend on the locale.
+    sink = (open(output, 'w', encoding='utf-8') if output
+            else contextlib.nullcontext(sys.stdout))
+    with sink as stream:
+        if output_format == 'json':
+            stream.write(json.dumps(payload, indent=2) + '\n')
+        else:
+            # Reuse the library's human-readable summary printer.
+            with contextlib.redirect_stdout(stream):
+                mc.printCompareInfos(compare_model)
+
+
+def _parse_args(argv: list[str]) -> argparse.Namespace:
+    """Parse the command-line arguments for the compare CLI."""
+    parser = argparse.ArgumentParser(
+        prog='python -m sgraph.cli.compare',
+        description='Compare two sgraph models and report the differences.')
+    add = parser.add_argument
+    add('model_a', help='Path to the "before"/old model (.xml or .xml.zip)')
+    add('model_b', help='Path to the "after"/new model (.xml or .xml.zip)')
+    add('-f', '--format', choices=['text', 'json'], default='text',
+        help='Output format (default: text)')
+    add('-o', '--output', metavar='FILE', default=None,
+        help='Write output to FILE instead of stdout')
+    rename_help = ('Detect renamed elements (collapses an add+remove '
+                   'into a single changed element annotated with old_name)')
+    add('--rename-detection', action='store_true', help=rename_help)
+    exclude_help = 'Comma-separated attribute names to ignore during comparison'
+    add('--exclude-attrs', metavar='a,b,c', default=None, help=exclude_help)
+    # argparse itself exits with status 2 on a usage error.
+    return parser.parse_args(argv)
+
+
+def run(argv: list[str]) -> int:
+    """Run the comparison. Returns the process exit code (0/1/2).
+
+    Load, parse, compare and output-write failures are reported on stderr
+    and return 2, so exit code 1 always means "differences found".
+    """
+    args = _parse_args(argv)
+
+    exclude_attrs = None
+    if args.exclude_attrs:
+        exclude_attrs = {name.strip() for name in args.exclude_attrs.split(',')
+                         if name.strip()}
+
+    mc = ModelCompare()
+    try:
+        if args.rename_detection:
+            # rename_detection lives on compareModels, so load the models first.
+            model1 = SGraph.parse_xml_or_zipped_xml(args.model_a)
+            model2 = SGraph.parse_xml_or_zipped_xml(args.model_b)
+            compare_model = mc.compareModels(model1, model2, rename_detection=True,
+                                             exclude_attrs=exclude_attrs)
+        else:
+            compare_model = mc.compare(args.model_a, args.model_b,
+                                       exclude_attrs=exclude_attrs)
+        infos = mc.getCompareInfos(compare_model)
+        payload = _build_payload(args.model_a, args.model_b, infos)
+        _write_output(payload, compare_model, mc, args.format, args.output)
+    except Exception as e:  # noqa: BLE001 - surface any failure as exit 2
+        print(f'Error: {e}', file=sys.stderr)
+        return 2
+    return 1 if any(payload['summary'].values()) else 0
+
+
+def main():  # CLI entry point: exit the process with run()'s status code
+    raise SystemExit(run(sys.argv[1:]))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/cli/test_compare_cli.py b/tests/cli/test_compare_cli.py
new file mode 100644
index 0000000..288538d
--- /dev/null
+++ b/tests/cli/test_compare_cli.py
@@ -0,0 +1,116 @@
+"""Tests for the `python -m sgraph.cli.compare` command-line entry point."""
+import json
+
+from sgraph import SGraph, SElement
+from sgraph.cli import compare as compare_cli
+
+MODEL_A = 'tests/modelfile.xml'
+MODEL_B = 'tests/modelfile_direct_indirect.xml'
+
+
+def _write_file_model(path, attrs):
+    """Save a one-file model to *path*; /proj/src/file.py gets every attr."""
+    model = SGraph(SElement(None, ''))
+    file_elem = model.createOrGetElementFromPath('/proj/src/file.py')
+    for key in attrs:
+        file_elem.addAttribute(key, attrs[key])
+    model.to_xml(path)
+
+
+def _write_single_child(path, name):
+    """Save a model to *path* containing just the leaf /p/src/<name>."""
+    graph = SGraph(SElement(None, ''))
+    graph.createOrGetElementFromPath('/p/src/' + name)
+    graph.to_xml(path)
+
+
+def test_text_output_prints_summary_and_exits_one_on_difference(capsys):
+    exit_code = compare_cli.run([MODEL_A, MODEL_B])
+    stdout = capsys.readouterr().out
+    # git-diff style: exit status 1 signals that differences were found
+    assert exit_code == 1
+    assert all(title in stdout for title in ('New elements', 'Removed elements'))
+
+
+def test_json_output_reports_correct_counts(capsys):
+    exit_code = compare_cli.run([MODEL_A, MODEL_B, '-f', 'json'])
+    captured = capsys.readouterr().out
+    assert exit_code == 1
+    report = json.loads(captured)
+    summary = report['summary']
+    expected = {'new_elements': 6, 'removed_elements': 26, 'new_dependencies': 4,
+                'removed_dependencies': 6, 'changed_elements': 2}
+    for category, count in expected.items():
+        assert summary[category] == count
+    # summary counts must match the list lengths
+    for category in ('new_elements', 'removed_dependencies'):
+        assert len(report[category]) == summary[category]
+    # dependency entries carry from/to/deptype/length
+    first_removed = report['removed_dependencies'][0]
+    assert set(first_removed) >= {'from', 'to', 'deptype', 'length'}
+    assert report['model_a'] == MODEL_A
+    assert report['model_b'] == MODEL_B
+
+
+def test_identical_models_exit_zero_with_empty_summary(capsys):
+    exit_code = compare_cli.run([MODEL_A, MODEL_A, '-f', 'json'])
+    captured = capsys.readouterr().out
+    assert exit_code == 0
+    counts = json.loads(captured)['summary'].values()
+    assert all(count == 0 for count in counts)
+
+
+def test_missing_file_exits_two(capsys):
+    exit_code = compare_cli.run(['tests/does_not_exist_xyz.xml', MODEL_B])
+    stderr = capsys.readouterr().err
+    assert exit_code == 2
+    assert 'rror' in stderr  # matches the "Error: ..." prefix
+
+
+def test_exclude_attrs_suppresses_attribute(tmp_path, capsys):
+    old = str(tmp_path / 'a.xml')
+    new = str(tmp_path / 'b.xml')
+    _write_file_model(old, {'hash': 'same', 'commit_count_30': '5'})
+    _write_file_model(new, {'hash': 'same', 'commit_count_30': '15'})
+    json_args = [old, new, '-f', 'json']
+
+    # Without exclude: the attribute change is reported, exit 1
+    exit_code = compare_cli.run(json_args)
+    report = json.loads(capsys.readouterr().out)
+    assert exit_code == 1
+    assert any('commit_count_30' in e['diff'] for e in report['attr_changes'])
+
+    # With exclude: no meaningful change remains, exit 0 and no attr_changes
+    exit_code = compare_cli.run(json_args + ['--exclude-attrs', 'commit_count_30'])
+    report = json.loads(capsys.readouterr().out)
+    assert exit_code == 0
+    assert report['attr_changes'] == []
+
+
+def test_rename_detection_collapses_add_remove_into_change(tmp_path, capsys):
+    a = str(tmp_path / 'a.xml')
+    b = str(tmp_path / 'b.xml')
+    _write_single_child(a, 'alpha.py')
+    _write_single_child(b, 'beta.py')
+
+    # Without rename detection: alpha removed, beta added (exit 1: models differ)
+    assert compare_cli.run([a, b, '-f', 'json']) == 1
+    data = json.loads(capsys.readouterr().out)
+    assert any(e['path'].endswith('alpha.py') for e in data['removed_elements'])
+    assert any(e['path'].endswith('beta.py') for e in data['new_elements'])
+
+    # With rename detection: one changed element carrying old_name (still exit 1)
+    assert compare_cli.run([a, b, '-f', 'json', '--rename-detection']) == 1
+    data = json.loads(capsys.readouterr().out)
+    assert data['removed_elements'] == []
+    assert data['new_elements'] == []
+    assert any(e.get('old_name') == 'alpha.py' for e in data['changed_elements'])
+
+
+def test_output_flag_writes_to_file_not_stdout(tmp_path, capsys):
+    target = tmp_path / 'out.json'
+    exit_code = compare_cli.run([MODEL_A, MODEL_B, '-f', 'json', '-o', str(target)])
+    assert exit_code == 1
+    assert not capsys.readouterr().out.strip()  # nothing on stdout
+    with target.open() as handle:
+        assert json.load(handle)['summary']['new_elements'] == 6