Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions packages/markitdown/src/markitdown/converters/_csv_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@
# MIME type prefixes for comma-separated and tab-separated text.
# NOTE(review): presumably matched against the incoming stream's MIME type when
# deciding whether this converter applies; the matching code is not visible here.
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv",
"application/csv",
"text/tab-separated-values",
"text/tsv",
]
# NOTE(review): ACCEPTED_FILE_EXTENSIONS is assigned twice in this view,
# presumably the removed and added sides of a diff. If both lines were executed,
# the second assignment (which also lists ".tsv") is the one that takes effect.
ACCEPTED_FILE_EXTENSIONS = [".csv"]
ACCEPTED_FILE_EXTENSIONS = [".csv", ".tsv"]

# Upper bound on how many characters of the decoded content are handed to
# csv.Sniffer when auto-detecting the delimiter.
SNIFF_SAMPLE_SIZE = 8192


class CsvConverter(DocumentConverter):
"""
Converts CSV files to Markdown tables.
Converts CSV and TSV files to Markdown tables.
"""

def __init__(self):
Expand Down Expand Up @@ -47,8 +51,12 @@ def convert(
else:
content = str(from_bytes(file_stream.read()).best())

# Parse CSV content
reader = csv.reader(io.StringIO(content))
# Auto-detect the delimiter
extension = (stream_info.extension or "").lower()
delimiter = self._detect_delimiter(content, extension)

# Parse content
reader = csv.reader(io.StringIO(content), delimiter=delimiter)
rows = list(reader)

if not rows:
Expand All @@ -57,8 +65,9 @@ def convert(
# Create markdown table
markdown_table = []

# Add header row
markdown_table.append("| " + " | ".join(rows[0]) + " |")
# Add header row (with pipe escaping)
header = [self._escape_cell(cell) for cell in rows[0]]
markdown_table.append("| " + " | ".join(header) + " |")

# Add separator row
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
Expand All @@ -70,8 +79,24 @@ def convert(
row.append("")
# Truncate if row has more columns than header
row = row[: len(rows[0])]
markdown_table.append("| " + " | ".join(row) + " |")
escaped = [self._escape_cell(cell) for cell in row]
markdown_table.append("| " + " | ".join(escaped) + " |")

result = "\n".join(markdown_table)

return DocumentConverterResult(markdown=result)

def _detect_delimiter(self, content: str, extension: str) -> str:
    """Pick the field delimiter to use when parsing ``content``.

    ``csv.Sniffer`` is asked to choose among a small set of common
    delimiters (comma, tab, semicolon, pipe). Left unrestricted, the
    sniffer may settle on any character that happens to occur equally
    often on every line -- for example a letter in a single-column file --
    which would split cells in the wrong places.

    Args:
        content: Decoded text of the file.
        extension: File extension including the leading dot (the caller
            passes it lower-cased), or ``""`` when unknown.

    Returns:
        The sniffed delimiter. When sniffing fails, a tab for ``.tsv``
        files and a comma otherwise.
    """
    sample = content[:SNIFF_SAMPLE_SIZE]
    if len(content) > len(sample):
        # The sample was cut at an arbitrary offset; drop the trailing
        # partial line so its lower delimiter count cannot skew detection.
        last_break = max(sample.rfind("\n"), sample.rfind("\r"))
        if last_break > 0:
            sample = sample[: last_break + 1]

    try:
        # Restricting the candidates makes the sniffer raise csv.Error
        # instead of guessing an ordinary character as the delimiter.
        return csv.Sniffer().sniff(sample, delimiters=",\t;|").delimiter
    except csv.Error:
        return "\t" if extension == ".tsv" else ","

def _escape_cell(self, cell: str) -> str:
"""Escape characters that would break a Markdown table."""
return cell.replace("|", "\\|").replace("\n", " ").replace("\r", "")
14 changes: 14 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,20 @@ class FileTestVector(object):
],
must_not_include=[],
),
FileTestVector(
filename="test.tsv",
mimetype="text/tsv",
charset="ascii",
url=None,
must_include=[
"| Name | Age | City | Notes |",
"| --- | --- | --- | --- |",
"| Alice | 30 | New York | Likes coffee |",
"| Bob | 25 | San Francisco | Uses pipes \\| often |",
"| Charlie | 35 | Chicago | N/A |",
],
must_not_include=[],
),
FileTestVector(
filename="test.json",
mimetype="application/json",
Expand Down
4 changes: 4 additions & 0 deletions packages/markitdown/tests/test_files/test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name Age City Notes
Alice 30 New York Likes coffee
Bob 25 San Francisco Uses pipes | often
Charlie 35 Chicago N/A