From bf9ac43bdfe309b86835ff9437fbe58f3e3bae60 Mon Sep 17 00:00:00 2001 From: Aadi <133583698+Aadi-011@users.noreply.github.com> Date: Sun, 31 May 2026 19:09:13 +0000 Subject: [PATCH 1/2] feat: add JSON output format option --- .../markitdown/src/markitdown/__main__.py | 18 +++++++++ packages/markitdown/src/markitdown/py.typed | 0 utilities/utils.py | 40 +++++++++++++++++++ 3 files changed, 58 insertions(+) delete mode 100644 packages/markitdown/src/markitdown/py.typed create mode 100644 utilities/utils.py diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ccb44b64b..662428385 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -4,6 +4,7 @@ import argparse import sys import codecs +import json from typing import Any, Dict from textwrap import dedent from importlib.metadata import entry_points @@ -11,6 +12,7 @@ from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult + def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", @@ -59,6 +61,13 @@ def main(): "--output", help="Output file name. If not provided, output is written to stdout.", ) + + parser.add_argument( + "--output-format", + choices=["markdown", "json"], + default="markdown", + help="Output format: 'markdown' (default) or structured 'json'." 
import re

# ATX heading: one to six '#' characters, whitespace, then the heading text.
_HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(.*)$")
# Opening/closing marker of a fenced code block (``` or ~~~, indented at most
# three spaces).
_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")

_DEFAULT_TITLE = "Untitled Document"
# Heading reported for text that appears before the first explicit heading.
_IMPLICIT_HEADING = "Introduction"


def _closes_fence(line, fence):
    """Return True if *line* closes a code fence opened with marker *fence*."""
    stripped = line.strip()
    # A closing fence uses the same character as the opener, is at least as
    # long, and carries no info string.
    return len(stripped) >= len(fence) and set(stripped) == {fence[0]}


def _append_section(sections, heading, body_lines):
    """Append one section to *sections*, skipping an empty implicit section."""
    content = "\n".join(body_lines).strip()
    if heading is None:
        # The implicit lead section only exists if it actually has text;
        # otherwise the blank line after a title would create an empty
        # "Introduction" entry in nearly every document.
        if not content:
            return
        heading = _IMPLICIT_HEADING
    sections.append({"heading": heading, "content": content})


def convert_markdown_to_json(md_text: str) -> dict:
    """Split Markdown text into a title and a flat list of sections.

    The first level-1 heading (``# ...``) becomes the title. Every other
    heading (levels 1-6) starts a new section, and the lines that follow it,
    up to the next heading, form that section's content. Text that appears
    before the first section heading is reported under the heading
    ``"Introduction"`` when it is not blank. Lines inside fenced code blocks
    are never interpreted as headings.

    Args:
        md_text: The Markdown document to parse.

    Returns:
        A JSON-serializable dict of the form
        ``{"title": str, "sections": [{"heading": str, "content": str}, ...]}``.
        ``title`` is ``"Untitled Document"`` when no level-1 heading exists.
        Sections opened by an explicit heading are kept even when empty.
    """
    title = None
    sections = []
    heading = None  # None marks the implicit section before any heading.
    body_lines = []
    open_fence = None  # Marker of the code fence we are inside, if any.

    for line in md_text.splitlines():
        fence_match = _FENCE_RE.match(line)

        if open_fence is not None:
            # Inside a code block everything is literal content, including
            # lines such as "# comment" that look like headings.
            body_lines.append(line)
            if fence_match and _closes_fence(line, open_fence):
                open_fence = None
            continue

        if fence_match:
            open_fence = fence_match.group(1)
            body_lines.append(line)
            continue

        heading_match = _HEADING_RE.match(line)
        if heading_match is None:
            body_lines.append(line)
            continue

        level = len(heading_match.group(1))
        text = heading_match.group(2).strip()

        if level == 1 and title is None:
            # Only the first H1 is the document title; later ones are
            # ordinary section headings so their text is not lost.
            title = text
            continue

        _append_section(sections, heading, body_lines)
        heading = text
        body_lines = []

    # Flush the last section with the same rule as every other section so a
    # trailing heading without body text is not dropped.
    _append_section(sections, heading, body_lines)

    return {
        "title": _DEFAULT_TITLE if title is None else title,
        "sections": sections,
    }