Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import argparse
import sys
import codecs
import json
from typing import Any, Dict
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult



def main():
parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.",
Expand Down Expand Up @@ -59,6 +61,13 @@ def main():
"--output",
help="Output file name. If not provided, output is written to stdout.",
)

parser.add_argument(
"--output-format",
choices=["markdown", "json"],
default="markdown",
help="Output format: 'markdown' (default) or structured 'json'."
)

parser.add_argument(
"-x",
Expand Down Expand Up @@ -260,6 +269,15 @@ def main():

def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if getattr(args, "output_format", "markdown") == "json":
# Convert the raw markdown to structured dictionary
structured_data = convert_markdown_to_json(result.markdown)

if args.output:
with open(args.output, "w", encoding="utf-8") as f:
json.dump(structured_data, f, indent=2, ensure_ascii=False)
else:
print(json.dumps(structured_data, indent=2, ensure_ascii=False))
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)
Expand Down
Empty file.
40 changes: 40 additions & 0 deletions utilities/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import json
import re

def convert_markdown_to_json(md_text: str) -> dict:
    """Split Markdown text into a title and a flat list of sections.

    The first level-1 ATX heading (``# ...``) becomes the document title.
    Every other heading (levels 1-6) starts a new section. Text that appears
    before the first section heading is collected under an implicit
    ``"Introduction"`` section, which is emitted only when it contains
    non-whitespace text.

    Lines inside fenced code blocks (``` or ~~~) are always treated as
    content, so shell/Python comments such as ``# install`` are never
    mistaken for headings.

    Args:
        md_text: Markdown source text.

    Returns:
        A JSON-serializable dict of the form
        ``{"title": str, "sections": [{"heading": str, "content": str}, ...]}``.
        ``title`` is ``"Untitled Document"`` when no level-1 heading exists.
        Sections introduced by an explicit heading are kept even when their
        content is empty.
    """
    # Compiled once, outside the per-line loop.
    heading_re = re.compile(r"^(#{1,6}) +(.*)$")
    fence_re = re.compile(r"^ {0,3}(`{3,}|~{3,})")

    title = "Untitled Document"
    title_seen = False
    sections = []

    def close_section(heading, body, explicit):
        """Append the finished section unless it is an empty implicit intro."""
        content = "\n".join(body).strip()
        if explicit or content:
            sections.append({"heading": heading, "content": content})

    heading = "Introduction"
    explicit = False  # True once the current section came from a real heading
    body = []
    open_fence = None  # marker string of the currently open code fence, if any

    for line in md_text.splitlines():
        fence = fence_re.match(line)
        if fence:
            marker = fence.group(1)
            rest = line[fence.end():]
            if open_fence is None:
                # A backtick fence may not have backticks in its info string;
                # that form is inline code (```x```), not a fence opener.
                if marker[0] == "~" or "`" not in rest:
                    open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                open_fence = None
            body.append(line)
            continue

        if open_fence is not None:
            # Inside a code block: never interpret '#' lines as headings.
            body.append(line)
            continue

        match = heading_re.match(line)
        if match is None:
            body.append(line)
            continue

        text = match.group(2).strip()
        if len(match.group(1)) == 1 and not title_seen:
            # Only the first H1 is the title; it does not start a section.
            title = text
            title_seen = True
            continue

        # Any other heading (including later H1s) begins a new section, so
        # its text is preserved instead of silently overwriting the title.
        close_section(heading, body, explicit)
        heading, explicit, body = text, True, []

    close_section(heading, body, explicit)

    return {"title": title, "sections": sections}