diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py
index 7e9631e1b..c9b6e4c58 100644
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -8,8 +8,9 @@
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/csv",
     "application/csv",
+    "text/tab-separated-values",
 ]
-ACCEPTED_FILE_EXTENSIONS = [".csv"]
+ACCEPTED_FILE_EXTENSIONS = [".csv", ".tsv"]
 
 
 class CsvConverter(DocumentConverter):
@@ -48,7 +49,16 @@ def convert(
             content = str(from_bytes(file_stream.read()).best())
 
         # Parse CSV content
-        reader = csv.reader(io.StringIO(content))
+        # Tab-delimited input is recognized by extension or by MIME type, since
+        # a stream accepted via its MIME type may carry no ".tsv" extension.
+        extension = (stream_info.extension or "").lower()
+        mimetype = (stream_info.mimetype or "").lower()
+        is_tsv = extension == ".tsv" or mimetype.startswith(
+            "text/tab-separated-values"
+        )
+        delimiter = "\t" if is_tsv else ","
+
+        reader = csv.reader(io.StringIO(content), delimiter=delimiter)
         rows = list(reader)
 
         if not rows:
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..53e05402f 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -152,6 +152,19 @@ class FileTestVector(object):
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test_sample.tsv",
+        mimetype="text/tab-separated-values",
+        charset="utf-8",
+        url=None,
+        must_include=[
+            "| Name | Age | City |",
+            "| --- | --- | --- |",
+            "| Rahul | 20 | Delhi |",
+            "| Priya | 21 | Noida |",
+        ],
+        must_not_include=[],
+    ),
     FileTestVector(
         filename="test.json",
         mimetype="application/json",
diff --git a/packages/markitdown/tests/test_files/test_sample.tsv b/packages/markitdown/tests/test_files/test_sample.tsv
new file mode 100644
index 000000000..c7af4e4cd
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test_sample.tsv
@@ -0,0 +1,3 @@
+Name	Age	City
+Rahul	20	Delhi
+Priya	21	Noida