diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py
index 7e9631e1b..d1d9441d0 100644
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -8,13 +8,17 @@
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/csv",
     "application/csv",
+    "text/tab-separated-values",
+    "text/tsv",
 ]
-ACCEPTED_FILE_EXTENSIONS = [".csv"]
+ACCEPTED_FILE_EXTENSIONS = [".csv", ".tsv"]
+
+SNIFF_SAMPLE_SIZE = 8192
 
 
 class CsvConverter(DocumentConverter):
     """
-    Converts CSV files to Markdown tables.
+    Converts CSV and TSV files to Markdown tables.
     """
 
     def __init__(self):
@@ -47,8 +51,12 @@ def convert(
         else:
             content = str(from_bytes(file_stream.read()).best())
 
-        # Parse CSV content
-        reader = csv.reader(io.StringIO(content))
+        # Auto-detect the delimiter
+        extension = (stream_info.extension or "").lower()
+        delimiter = self._detect_delimiter(content, extension)
+
+        # Parse content
+        reader = csv.reader(io.StringIO(content), delimiter=delimiter)
         rows = list(reader)
 
         if not rows:
@@ -57,8 +65,9 @@ def convert(
         # Create markdown table
         markdown_table = []
 
-        # Add header row
-        markdown_table.append("| " + " | ".join(rows[0]) + " |")
+        # Add header row (with pipe escaping)
+        header = [self._escape_cell(cell) for cell in rows[0]]
+        markdown_table.append("| " + " | ".join(header) + " |")
 
         # Add separator row
         markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
@@ -70,8 +79,35 @@ def convert(
                 row.append("")
             # Truncate if row has more columns than header
             row = row[: len(rows[0])]
-            markdown_table.append("| " + " | ".join(row) + " |")
+            escaped = [self._escape_cell(cell) for cell in row]
+            markdown_table.append("| " + " | ".join(escaped) + " |")
 
         result = "\n".join(markdown_table)
 
         return DocumentConverterResult(markdown=result)
+
+    def _detect_delimiter(self, content: str, extension: str) -> str:
+        """Return the field delimiter to use when parsing ``content``.
+
+        A ".tsv" extension is trusted outright. Otherwise csv.Sniffer inspects
+        the first SNIFF_SAMPLE_SIZE characters, restricted to comma, tab,
+        semicolon and pipe so that spaces or colons inside single-column data
+        are never mistaken for separators. Falls back to "," if sniffing fails.
+        """
+        if extension == ".tsv":
+            return "\t"
+        sample = content[:SNIFF_SAMPLE_SIZE]
+        try:
+            dialect = csv.Sniffer().sniff(sample, delimiters=",\t;|")
+        except csv.Error:
+            return ","
+        return dialect.delimiter
+
+    def _escape_cell(self, cell: str) -> str:
+        """Escape characters that would break a Markdown table.
+
+        Pipes are backslash-escaped and every line break (CRLF, LF or a lone
+        CR) is replaced by one space so a cell always stays on a single row.
+        """
+        cell = cell.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
+        return cell.replace("|", "\\|")
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..d47b47d8b 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -152,6 +152,20 @@ class FileTestVector(object):
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test.tsv",
+        mimetype="text/tsv",
+        charset="ascii",
+        url=None,
+        must_include=[
+            "| Name | Age | City | Notes |",
+            "| --- | --- | --- | --- |",
+            "| Alice | 30 | New York | Likes coffee |",
+            "| Bob | 25 | San Francisco | Uses pipes \\| often |",
+            "| Charlie | 35 | Chicago | N/A |",
+        ],
+        must_not_include=[],
+    ),
     FileTestVector(
         filename="test.json",
         mimetype="application/json",
diff --git a/packages/markitdown/tests/test_files/test.tsv b/packages/markitdown/tests/test_files/test.tsv
new file mode 100644
index 000000000..68a311963
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test.tsv
@@ -0,0 +1,4 @@
+Name	Age	City	Notes
+Alice	30	New York	Likes coffee
+Bob	25	San Francisco	Uses pipes | often
+Charlie	35	Chicago	N/A