Skip to content

Commit 224939c

Browse files
committed
fix: improve markitdown parser consistency and add real-file tests
Critical fixes:
- WordParser: preserve table position in document order (previously all tables were appended at the end, losing context). Walk the document body XML in order instead of iterating paragraphs and then tables separately.
- PowerPointParser: replace magic number (type == 1) with proper PP_PLACEHOLDER enum constants; also handle CENTER_TITLE.
- AudioParser: add Vorbis/FLAC/OGG tag extraction (previously only handled ID3 and MP4 formats). Tries all format mappings with dedup.
- ZipParser: replace emoji in tree view with plain text markers for robustness in text processing pipelines.
- TextParser: set parser_name='TextParser' on parse_content results for consistency with all other parsers.
- __init__.py: export all new parser classes for public API.

Tests (16 new, 39 total):
- Real .docx/.xlsx/.pptx file creation and parsing
- EPub HTML-to-markdown conversion edge cases
- ZIP bad-file error handling and no-emoji tree view
- AudioParser Vorbis tag extraction and edge cases
- WordParser can_parse() extension matching
1 parent 8f4d1b0 commit 224939c

File tree

8 files changed

+385
-66
lines changed

8 files changed

+385
-66
lines changed
Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,32 @@
11
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
from .audio import AudioParser
45
from .base_parser import BaseParser
56
from .code import CodeRepositoryParser
7+
from .epub import EPubParser
8+
from .excel import ExcelParser
69
from .html import HTMLParser, URLType, URLTypeDetector
710
from .markdown import MarkdownParser
811
from .pdf import PDFParser
12+
from .powerpoint import PowerPointParser
913
from .text import TextParser
14+
from .word import WordParser
15+
from .zip_parser import ZipParser
1016

1117
__all__ = [
18+
"AudioParser",
1219
"BaseParser",
1320
"CodeRepositoryParser",
21+
"EPubParser",
22+
"ExcelParser",
1423
"HTMLParser",
1524
"URLType",
1625
"URLTypeDetector",
1726
"MarkdownParser",
1827
"PDFParser",
28+
"PowerPointParser",
1929
"TextParser",
30+
"WordParser",
31+
"ZipParser",
2032
]

openviking/parse/parsers/audio.py

Lines changed: 74 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -131,54 +131,82 @@ def _convert_to_markdown(self, path: Path) -> str:
131131
return "\n\n".join(markdown_parts)
132132

133133
def _extract_tags(self, audio) -> dict:
134-
"""Extract metadata tags from audio file."""
134+
"""Extract metadata tags from audio file.
135+
136+
Handles ID3 (MP3), MP4/M4A, and Vorbis/FLAC/OGG tag formats
137+
via mutagen's format-specific tag keys.
138+
"""
135139
tags = {}
136140

137-
if hasattr(audio, "tags") and audio.tags:
138-
tag_mapping = {
139-
"TIT2": "Title",
140-
"TPE1": "Artist",
141-
"TALB": "Album",
142-
"TCON": "Genre",
143-
"TYER": "Year",
144-
"TDRC": "Date",
145-
"TRCK": "Track Number",
146-
"COMM": "Comments",
147-
"TPE2": "Album Artist",
148-
"TPUB": "Publisher",
149-
"TCOM": "Composer",
150-
}
151-
152-
for key, label in tag_mapping.items():
153-
if key in audio.tags:
154-
try:
155-
value = str(audio.tags[key])
156-
if value:
157-
tags[label] = value
158-
except Exception:
159-
pass
160-
161-
# For MP4/M4A files
162-
if hasattr(audio.tags, "_DictProxy__dict"):
163-
mp4_mapping = {
164-
"\xa9nam": "Title",
165-
"\xa9ART": "Artist",
166-
"\xa9alb": "Album",
167-
"\xa9gen": "Genre",
168-
"\xa9day": "Year",
169-
"trkn": "Track Number",
170-
}
171-
for key, label in mp4_mapping.items():
172-
if key in audio.tags:
173-
try:
174-
value = (
175-
audio.tags[key][0]
176-
if isinstance(audio.tags[key], list)
177-
else audio.tags[key]
178-
)
179-
tags[label] = str(value)
180-
except Exception:
181-
pass
141+
if not (hasattr(audio, "tags") and audio.tags):
142+
return tags
143+
144+
# ID3 tags (MP3)
145+
id3_mapping = {
146+
"TIT2": "Title",
147+
"TPE1": "Artist",
148+
"TALB": "Album",
149+
"TCON": "Genre",
150+
"TYER": "Year",
151+
"TDRC": "Date",
152+
"TRCK": "Track Number",
153+
"COMM": "Comments",
154+
"TPE2": "Album Artist",
155+
"TPUB": "Publisher",
156+
"TCOM": "Composer",
157+
}
158+
159+
for key, label in id3_mapping.items():
160+
if key in audio.tags:
161+
try:
162+
value = str(audio.tags[key])
163+
if value:
164+
tags[label] = value
165+
except Exception:
166+
pass
167+
168+
# MP4/M4A tags
169+
mp4_mapping = {
170+
"\xa9nam": "Title",
171+
"\xa9ART": "Artist",
172+
"\xa9alb": "Album",
173+
"\xa9gen": "Genre",
174+
"\xa9day": "Year",
175+
"trkn": "Track Number",
176+
"aART": "Album Artist",
177+
"\xa9wrt": "Composer",
178+
}
179+
for key, label in mp4_mapping.items():
180+
if label not in tags and key in audio.tags:
181+
try:
182+
value = (
183+
audio.tags[key][0] if isinstance(audio.tags[key], list) else audio.tags[key]
184+
)
185+
tags[label] = str(value)
186+
except Exception:
187+
pass
188+
189+
# Vorbis comments (FLAC, OGG) — keys are case-insensitive strings
190+
vorbis_mapping = {
191+
"title": "Title",
192+
"artist": "Artist",
193+
"album": "Album",
194+
"genre": "Genre",
195+
"date": "Date",
196+
"tracknumber": "Track Number",
197+
"albumartist": "Album Artist",
198+
"composer": "Composer",
199+
"comment": "Comments",
200+
}
201+
for key, label in vorbis_mapping.items():
202+
if label not in tags and key in audio.tags:
203+
try:
204+
value = audio.tags[key]
205+
if isinstance(value, list):
206+
value = value[0]
207+
tags[label] = str(value)
208+
except Exception:
209+
pass
182210

183211
return tags
184212

openviking/parse/parsers/powerpoint.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -103,21 +103,25 @@ def _convert_to_markdown(self, path: Path, pptx) -> str:
103103

104104
def _extract_slide_title(self, slide) -> str:
105105
"""Extract title from a slide."""
106+
from pptx.enum.shapes import PP_PLACEHOLDER
107+
106108
for shape in slide.shapes:
107109
if shape.is_placeholder:
108-
placeholder_format = shape.placeholder_format
109-
if placeholder_format.type == 1: # TITLE
110+
ph_type = shape.placeholder_format.type
111+
if ph_type in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
110112
return shape.text.strip()
111113
return ""
112114

113115
def _extract_slide_content(self, slide) -> str:
114116
"""Extract content from slide shapes."""
117+
from pptx.enum.shapes import PP_PLACEHOLDER
118+
115119
content_parts = []
116120

117121
for shape in slide.shapes:
118122
if shape.is_placeholder:
119-
placeholder_format = shape.placeholder_format
120-
if placeholder_format.type == 1: # TITLE
123+
ph_type = shape.placeholder_format.type
124+
if ph_type in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
121125
continue
122126

123127
if hasattr(shape, "text") and shape.text.strip():

openviking/parse/parsers/text.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,4 +37,5 @@ async def parse_content(
3737
"""Parse text content - delegates to MarkdownParser."""
3838
result = await self._md_parser.parse_content(content, source_path, **kwargs)
3939
result.source_format = "text"
40+
result.parser_name = "TextParser"
4041
return result

openviking/parse/parsers/word.py

Lines changed: 30 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -68,25 +68,42 @@ async def parse_content(
6868
return result
6969

7070
def _convert_to_markdown(self, path: Path, docx) -> str:
71-
"""Convert Word document to Markdown string."""
71+
"""Convert Word document to Markdown string.
72+
73+
Iterates the document body in order so that tables appear in their
74+
original position rather than being appended at the end.
75+
"""
7276
doc = docx.Document(path)
7377
markdown_parts = []
7478

75-
for paragraph in doc.paragraphs:
76-
if not paragraph.text.strip():
77-
continue
79+
# Map XML table elements to python-docx Table objects for O(1) lookup
80+
table_by_element = {table._tbl: table for table in doc.tables}
81+
82+
# Walk the document body in order to preserve table positions
83+
from docx.oxml.ns import qn
84+
85+
for child in doc.element.body:
86+
if child.tag == qn("w:p"):
87+
# It's a paragraph
88+
from docx.text.paragraph import Paragraph
89+
90+
paragraph = Paragraph(child, doc)
91+
if not paragraph.text.strip():
92+
continue
7893

79-
style_name = paragraph.style.name if paragraph.style else "Normal"
94+
style_name = paragraph.style.name if paragraph.style else "Normal"
8095

81-
if style_name.startswith("Heading"):
82-
level = self._extract_heading_level(style_name)
83-
markdown_parts.append(f"{'#' * level} {paragraph.text}")
84-
else:
85-
text = self._convert_formatted_text(paragraph)
86-
markdown_parts.append(text)
96+
if style_name.startswith("Heading"):
97+
level = self._extract_heading_level(style_name)
98+
markdown_parts.append(f"{'#' * level} {paragraph.text}")
99+
else:
100+
text = self._convert_formatted_text(paragraph)
101+
markdown_parts.append(text)
87102

88-
for table in doc.tables:
89-
markdown_parts.append(self._convert_table(table))
103+
elif child.tag == qn("w:tbl"):
104+
# It's a table
105+
if child in table_by_element:
106+
markdown_parts.append(self._convert_table(table_by_element[child]))
90107

91108
return "\n\n".join(markdown_parts)
92109

openviking/parse/parsers/zip_parser.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -217,9 +217,9 @@ def _generate_tree_view(self, filenames: List[str]) -> str:
217217

218218
# Add prefix for directories vs files
219219
if item.endswith("/"):
220-
prefix = "📁 "
220+
prefix = "[dir] "
221221
else:
222-
prefix = "📄 "
222+
prefix = ""
223223

224224
lines.append(f"{indent}{prefix}{name}")
225225

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,6 @@ dependencies = [
5959
"pdfminer-six>=20251230",
6060
"typer>=0.12.0",
6161
]
62-
]
6362

6463
[project.optional-dependencies]
6564
test = [

0 commit comments

Comments
 (0)