Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 87 additions & 1 deletion openviking/models/vlm/backends/volcengine_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@

import asyncio
import base64
import logging
from pathlib import Path
from typing import Any, Dict, List, Union

from .openai_vlm import OpenAIVLM

logger = logging.getLogger(__name__)


class VolcEngineVLM(OpenAIVLM):
"""VolcEngine VLM backend"""
Expand Down Expand Up @@ -98,13 +101,81 @@ async def get_completion_async(
else:
raise RuntimeError("Unknown error in async completion")

def _detect_image_format(self, data: bytes) -> str:
"""Detect image format from magic bytes.

Returns the MIME type, or raises ValueError for unsupported formats like SVG.

Supported formats per VolcEngine docs:
https://www.volcengine.com/docs/82379/1362931
- JPEG, PNG, GIF, WEBP, BMP, TIFF, ICO, DIB, ICNS, SGI, JPEG2000, HEIC, HEIF
"""
if len(data) < 12:
logger.warning(f"[VolcEngineVLM] Image data too small: {len(data)} bytes")
return "image/png"

# PNG: 89 50 4E 47 0D 0A 1A 0A
if data[:8] == b'\x89PNG\r\n\x1a\n':
return "image/png"
# JPEG: FF D8
elif data[:2] == b'\xff\xd8':
return "image/jpeg"
# GIF: GIF87a or GIF89a
elif data[:6] in (b'GIF87a', b'GIF89a'):
return "image/gif"
# WEBP: RIFF....WEBP
elif data[:4] == b'RIFF' and len(data) >= 12 and data[8:12] == b'WEBP':
return "image/webp"
# BMP: BM
elif data[:2] == b'BM':
return "image/bmp"
# TIFF (little-endian): 49 49 2A 00
# TIFF (big-endian): 4D 4D 00 2A
elif data[:4] == b'II*\x00' or data[:4] == b'MM\x00*':
return "image/tiff"
# ICO: 00 00 01 00
elif data[:4] == b'\x00\x00\x01\x00':
return "image/ico"
# ICNS: 69 63 6E 73 ("icns")
elif data[:4] == b'icns':
return "image/icns"
# SGI: 01 DA
elif data[:2] == b'\x01\xda':
return "image/sgi"
# JPEG2000: 00 00 00 0C 6A 50 20 20 (JP2 signature)
elif data[:8] == b'\x00\x00\x00\x0cjP ' or data[:4] == b'\xff\x4f\xff\x51':
return "image/jp2"
# HEIC/HEIF: ftyp box with heic/heif brand
# 00 00 00 XX 66 74 79 70 68 65 69 63 (heic)
# 00 00 00 XX 66 74 79 70 68 65 69 66 (heif)
elif len(data) >= 12 and data[4:8] == b'ftyp':
brand = data[8:12]
if brand == b'heic':
return "image/heic"
elif brand == b'heif':
return "image/heif"
elif brand[:3] == b'mif':
return "image/heif"
# SVG (not supported)
elif data[:4] == b'<svg' or (data[:5] == b'<?xml' and b'<svg' in data[:100]):
raise ValueError(
"SVG format is not supported by VolcEngine VLM API. "
"Supported formats: JPEG, PNG, GIF, WEBP, BMP, TIFF, ICO, ICNS, SGI, JPEG2000, HEIC, HEIF"
)

# Unknown format - log and default to PNG
logger.warning(f"[VolcEngineVLM] Unknown image format, magic bytes: {data[:16].hex()}")
return "image/png"

def _prepare_image(self, image: Union[str, Path, bytes]) -> Dict[str, Any]:
"""Prepare image data"""
if isinstance(image, bytes):
b64 = base64.b64encode(image).decode("utf-8")
mime_type = self._detect_image_format(image)
logger.info(f"[VolcEngineVLM] Preparing image from bytes, size={len(image)}, detected mime={mime_type}")
return {
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{b64}"},
"image_url": {"url": f"data:{mime_type};base64,{b64}"},
}
elif isinstance(image, Path) or (
isinstance(image, str) and not image.startswith(("http://", "https://"))
Expand All @@ -117,6 +188,21 @@ def _prepare_image(self, image: Union[str, Path, bytes]) -> Dict[str, Any]:
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
".bmp": "image/bmp",
".dib": "image/bmp",
".tiff": "image/tiff",
".tif": "image/tiff",
".ico": "image/ico",
".icns": "image/icns",
".sgi": "image/sgi",
".j2c": "image/jp2",
".j2k": "image/jp2",
".jp2": "image/jp2",
".jpc": "image/jp2",
".jpf": "image/jp2",
".jpx": "image/jp2",
".heic": "image/heic",
".heif": "image/heif",
}.get(suffix, "image/png")
with open(path, "rb") as f:
b64 = base64.b64encode(f.read()).decode("utf-8")
Expand Down
19 changes: 9 additions & 10 deletions openviking/models/vlm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,21 +127,20 @@ def create(config: Dict[str, Any]) -> VLMBase:
"""
provider = config.get("provider") or config.get("backend") or "openai"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里可以用 .lower() 处理下,避免用户大小写配置错误


use_litellm = config.get("use_litellm", True)
if provider == "volcengine":
from .backends.volcengine_vlm import VolcEngineVLM

if not use_litellm:
if provider == "openai":
return VolcEngineVLM(config)

elif provider == "openai":
from .backends.openai_vlm import OpenAIVLM

return OpenAIVLM(config)
elif provider == "volcengine":
from .backends.volcengine_vlm import VolcEngineVLM

return VolcEngineVLM(config)

from .backends.litellm_vlm import LiteLLMVLMProvider

return LiteLLMVLMProvider(config)
else:
from .backends.litellm_vlm import LiteLLMVLMProvider

return LiteLLMVLMProvider(config)

@staticmethod
def get_available_providers() -> List[str]:
Expand Down
47 changes: 47 additions & 0 deletions openviking/parse/parsers/media/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,38 @@
logger = get_logger(__name__)


def _is_svg(data: bytes) -> bool:
"""Check if the data is an SVG file."""
return data[:4] == b'<svg' or (data[:5] == b'<?xml' and b'<svg' in data[:100])


# SVG to PNG conversion (disabled by default)
# Uncomment and install dependencies if you need SVG support:
# Ubuntu/Debian: sudo apt-get install libcairo2 && pip install cairosvg
# macOS: brew install cairo && pip install cairosvg
# Or use ImageMagick: sudo apt-get install libmagickwand-dev && pip install Wand
#
# def _convert_svg_to_png(svg_data: bytes) -> Optional[bytes]:
# """Convert SVG to PNG using cairosvg or wand."""
# try:
# import cairosvg
# return cairosvg.svg2png(bytestring=svg_data)
# except ImportError:
# pass
# except OSError:
# pass # libcairo not installed
#
# try:
# from wand.image import Image as WandImage
# with WandImage(blob=svg_data, format='svg') as img:
# img.format = 'png'
# return img.make_blob()
# except ImportError:
# pass
#
# return None


def get_media_type(source_path: Optional[str], source_format: Optional[str]) -> Optional[str]:
"""
Determine media type from source path or format.
Expand Down Expand Up @@ -85,6 +117,14 @@ async def generate_image_summary(
if not isinstance(image_bytes, bytes):
raise ValueError(f"Expected bytes for image file, got {type(image_bytes)}")

# Check for unsupported formats (SVG, etc.) by detecting magic bytes
# SVG format is not supported by VolcEngine VLM API, skip VLM analysis
if _is_svg(image_bytes):
logger.info(
f"[MediaUtils.generate_image_summary] SVG format detected, skipping VLM analysis: {image_uri}"
)
return {"name": file_name, "summary": "SVG image (format not supported by VLM)"}

logger.info(
f"[MediaUtils.generate_image_summary] Generating summary for image: {image_uri}"
)
Expand All @@ -107,6 +147,13 @@ async def generate_image_summary(
)
return {"name": file_name, "summary": response.strip()}

except ValueError as e:
if "SVG format" in str(e) or "not supported" in str(e):
logger.warning(
f"[MediaUtils.generate_image_summary] Unsupported image format for {image_uri}: {e}"
)
return {"name": file_name, "summary": f"Unsupported image format: {str(e)}"}
raise
except Exception as e:
logger.error(
f"[MediaUtils.generate_image_summary] Failed to generate image summary: {e}",
Expand Down
Loading