fix: improve markitdown parser consistency and add real-file tests

ZaynJarvis · ZaynJarvis · commit 224939cb3b34 · 2026-02-12T09:49:28.000+08:00
Critical fixes:
- WordParser: preserve table position in document order (was appending
  all tables at end, losing context). Walk document body XML in order
  instead of iterating paragraphs then tables separately.
- PowerPointParser: replace magic number (type == 1) with proper
  PP_PLACEHOLDER enum constants, also handle CENTER_TITLE.
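- AudioParser: add Vorbis/FLAC/OGG tag extraction (previously only
  handled ID3 and MP4 formats). Tries the ID3, MP4, and Vorbis mappings
  in that order; a label already filled by an earlier mapping is kept,
  so each label appears at most once.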
- ZipParser: replace emoji in tree view with plain text markers
  ("[dir] " prefix for directories, no prefix for files) for
  robustness in text processing pipelines.
- TextParser: set parser_name='TextParser' on parse_content results
  for consistency with all other parsers.
- __init__.py: export all new parser classes for public API.
- pyproject.toml: remove a stray closing bracket after the
  dependencies list.

Tests (16 new, 39 total):
- Real .docx/.xlsx/.pptx file creation and parsing
- EPub HTML-to-markdown conversion edge cases
- ZIP bad-file error handling and no-emoji tree view
- AudioParser Vorbis tag extraction and edge cases
- WordParser can_parse() extension matching
diff --git a/openviking/parse/parsers/__init__.py b/openviking/parse/parsers/__init__.py
@@ -1,20 +1,32 @@
 # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
 # SPDX-License-Identifier: Apache-2.0
 
+from .audio import AudioParser
 from .base_parser import BaseParser
 from .code import CodeRepositoryParser
+from .epub import EPubParser
+from .excel import ExcelParser
 from .html import HTMLParser, URLType, URLTypeDetector
 from .markdown import MarkdownParser
 from .pdf import PDFParser
+from .powerpoint import PowerPointParser
 from .text import TextParser
+from .word import WordParser
+from .zip_parser import ZipParser
 
 __all__ = [
+    "AudioParser",
     "BaseParser",
     "CodeRepositoryParser",
+    "EPubParser",
+    "ExcelParser",
     "HTMLParser",
     "URLType",
     "URLTypeDetector",
     "MarkdownParser",
     "PDFParser",
+    "PowerPointParser",
     "TextParser",
+    "WordParser",
+    "ZipParser",
 ]
diff --git a/openviking/parse/parsers/audio.py b/openviking/parse/parsers/audio.py
@@ -131,54 +131,82 @@ def _convert_to_markdown(self, path: Path) -> str:
         return "\n\n".join(markdown_parts)
 
     def _extract_tags(self, audio) -> dict:
-        """Extract metadata tags from audio file."""
+        """Extract metadata tags from audio file.
+
+        Handles ID3 (MP3), MP4/M4A, and Vorbis/FLAC/OGG tag formats
+        via mutagen's format-specific tag keys.
+        """
         tags = {}
 
-        if hasattr(audio, "tags") and audio.tags:
-            tag_mapping = {
-                "TIT2": "Title",
-                "TPE1": "Artist",
-                "TALB": "Album",
-                "TCON": "Genre",
-                "TYER": "Year",
-                "TDRC": "Date",
-                "TRCK": "Track Number",
-                "COMM": "Comments",
-                "TPE2": "Album Artist",
-                "TPUB": "Publisher",
-                "TCOM": "Composer",
-            }
-
-            for key, label in tag_mapping.items():
-                if key in audio.tags:
-                    try:
-                        value = str(audio.tags[key])
-                        if value:
-                            tags[label] = value
-                    except Exception:
-                        pass
-
-            # For MP4/M4A files
-            if hasattr(audio.tags, "_DictProxy__dict"):
-                mp4_mapping = {
-                    "\xa9nam": "Title",
-                    "\xa9ART": "Artist",
-                    "\xa9alb": "Album",
-                    "\xa9gen": "Genre",
-                    "\xa9day": "Year",
-                    "trkn": "Track Number",
-                }
-                for key, label in mp4_mapping.items():
-                    if key in audio.tags:
-                        try:
-                            value = (
-                                audio.tags[key][0]
-                                if isinstance(audio.tags[key], list)
-                                else audio.tags[key]
-                            )
-                            tags[label] = str(value)
-                        except Exception:
-                            pass
+        if not (hasattr(audio, "tags") and audio.tags):
+            return tags
+
+        # ID3 tags (MP3)
+        id3_mapping = {
+            "TIT2": "Title",
+            "TPE1": "Artist",
+            "TALB": "Album",
+            "TCON": "Genre",
+            "TYER": "Year",
+            "TDRC": "Date",
+            "TRCK": "Track Number",
+            "COMM": "Comments",
+            "TPE2": "Album Artist",
+            "TPUB": "Publisher",
+            "TCOM": "Composer",
+        }
+
+        for key, label in id3_mapping.items():
+            if key in audio.tags:
+                try:
+                    value = str(audio.tags[key])
+                    if value:
+                        tags[label] = value
+                except Exception:
+                    pass
+
+        # MP4/M4A tags
+        mp4_mapping = {
+            "\xa9nam": "Title",
+            "\xa9ART": "Artist",
+            "\xa9alb": "Album",
+            "\xa9gen": "Genre",
+            "\xa9day": "Year",
+            "trkn": "Track Number",
+            "aART": "Album Artist",
+            "\xa9wrt": "Composer",
+        }
+        for key, label in mp4_mapping.items():
+            if label not in tags and key in audio.tags:
+                try:
+                    value = (
+                        audio.tags[key][0] if isinstance(audio.tags[key], list) else audio.tags[key]
+                    )
+                    tags[label] = str(value)
+                except Exception:
+                    pass
+
+        # Vorbis comments (FLAC, OGG) — keys are case-insensitive strings
+        vorbis_mapping = {
+            "title": "Title",
+            "artist": "Artist",
+            "album": "Album",
+            "genre": "Genre",
+            "date": "Date",
+            "tracknumber": "Track Number",
+            "albumartist": "Album Artist",
+            "composer": "Composer",
+            "comment": "Comments",
+        }
+        for key, label in vorbis_mapping.items():
+            if label not in tags and key in audio.tags:
+                try:
+                    value = audio.tags[key]
+                    if isinstance(value, list):
+                        value = value[0]
+                    tags[label] = str(value)
+                except Exception:
+                    pass
 
         return tags
 
diff --git a/openviking/parse/parsers/powerpoint.py b/openviking/parse/parsers/powerpoint.py
@@ -103,21 +103,25 @@ def _convert_to_markdown(self, path: Path, pptx) -> str:
 
     def _extract_slide_title(self, slide) -> str:
         """Extract title from a slide."""
+        from pptx.enum.shapes import PP_PLACEHOLDER
+
         for shape in slide.shapes:
             if shape.is_placeholder:
-                placeholder_format = shape.placeholder_format
-                if placeholder_format.type == 1:  # TITLE
+                ph_type = shape.placeholder_format.type
+                if ph_type in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
                     return shape.text.strip()
         return ""
 
     def _extract_slide_content(self, slide) -> str:
         """Extract content from slide shapes."""
+        from pptx.enum.shapes import PP_PLACEHOLDER
+
         content_parts = []
 
         for shape in slide.shapes:
             if shape.is_placeholder:
-                placeholder_format = shape.placeholder_format
-                if placeholder_format.type == 1:  # TITLE
+                ph_type = shape.placeholder_format.type
+                if ph_type in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
                     continue
 
             if hasattr(shape, "text") and shape.text.strip():
diff --git a/openviking/parse/parsers/text.py b/openviking/parse/parsers/text.py
@@ -37,4 +37,5 @@ async def parse_content(
         """Parse text content - delegates to MarkdownParser."""
         result = await self._md_parser.parse_content(content, source_path, **kwargs)
         result.source_format = "text"
+        result.parser_name = "TextParser"
         return result
diff --git a/openviking/parse/parsers/word.py b/openviking/parse/parsers/word.py
@@ -68,25 +68,42 @@ async def parse_content(
         return result
 
     def _convert_to_markdown(self, path: Path, docx) -> str:
-        """Convert Word document to Markdown string."""
+        """Convert Word document to Markdown string.
+
+        Iterates the document body in order so that tables appear in their
+        original position rather than being appended at the end.
+        """
         doc = docx.Document(path)
         markdown_parts = []
 
-        for paragraph in doc.paragraphs:
-            if not paragraph.text.strip():
-                continue
+        # Map XML table elements to python-docx Table objects for O(1) lookup
+        table_by_element = {table._tbl: table for table in doc.tables}
+
+        # Walk the document body in order to preserve table positions
+        from docx.oxml.ns import qn
+
+        for child in doc.element.body:
+            if child.tag == qn("w:p"):
+                # It's a paragraph
+                from docx.text.paragraph import Paragraph
+
+                paragraph = Paragraph(child, doc)
+                if not paragraph.text.strip():
+                    continue
 
-            style_name = paragraph.style.name if paragraph.style else "Normal"
+                style_name = paragraph.style.name if paragraph.style else "Normal"
 
-            if style_name.startswith("Heading"):
-                level = self._extract_heading_level(style_name)
-                markdown_parts.append(f"{'#' * level} {paragraph.text}")
-            else:
-                text = self._convert_formatted_text(paragraph)
-                markdown_parts.append(text)
+                if style_name.startswith("Heading"):
+                    level = self._extract_heading_level(style_name)
+                    markdown_parts.append(f"{'#' * level} {paragraph.text}")
+                else:
+                    text = self._convert_formatted_text(paragraph)
+                    markdown_parts.append(text)
 
-        for table in doc.tables:
-            markdown_parts.append(self._convert_table(table))
+            elif child.tag == qn("w:tbl"):
+                # It's a table
+                if child in table_by_element:
+                    markdown_parts.append(self._convert_table(table_by_element[child]))
 
         return "\n\n".join(markdown_parts)
 
diff --git a/openviking/parse/parsers/zip_parser.py b/openviking/parse/parsers/zip_parser.py
@@ -217,9 +217,9 @@ def _generate_tree_view(self, filenames: List[str]) -> str:
 
             # Add prefix for directories vs files
             if item.endswith("/"):
-                prefix = "📁 "
+                prefix = "[dir] "
             else:
-                prefix = "📄 "
+                prefix = ""
 
             lines.append(f"{indent}{prefix}{name}")
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,7 +59,6 @@ dependencies = [
     "pdfminer-six>=20251230",
     "typer>=0.12.0",
 ]
-]
 
 [project.optional-dependencies]
 test = [
diff --git a/tests/parse/test_markitdown_parsers.py b/tests/parse/test_markitdown_parsers.py

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,6 @@ dependencies = [`
`59`	`59`	`"pdfminer-six>=20251230",`
`60`	`60`	`"typer>=0.12.0",`
`61`	`61`	`]`
`62`		`-]`
`63`	`62`
`64`	`63`	`[project.optional-dependencies]`
`65`	`64`	`test = [`