Skip to content

Commit fd82aac

Browse files
authored
fix: resolve Gemini 404, directory collision, and Unicode decoding errors (#314)
- Fix Gemini 404 by preventing malformed api_base and global state pollution in LiteLLMVLMProvider.
- Fix AGFSClientError by adding exist_ok=True to mkdir calls in multiple parsers.
- Fix UnicodeDecodeError in VikingFS by implementing robust multi-stage decoding (UTF-8, GBK, Latin-1).
1 parent 9e69113 commit fd82aac

File tree

9 files changed

+56
-14
lines changed

9 files changed

+56
-14
lines changed

openviking/models/vlm/backends/litellm_vlm.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"litellm_prefix": "deepseek",
4747
},
4848
"gemini": {
49-
"keywords": ("gemini",),
49+
"keywords": ("gemini", "google"),
5050
"env_key": "GEMINI_API_KEY",
5151
"litellm_prefix": "gemini",
5252
},
@@ -105,23 +105,24 @@ def __init__(self, config: Dict[str, Any]):
105105
if self.api_key:
106106
self._setup_env(self.api_key, self.model)
107107

108-
if self.api_base:
109-
litellm.api_base = self.api_base
110-
108+
# Configure LiteLLM behavior (these are global but safe to re-set)
111109
litellm.suppress_debug_info = True
112110
litellm.drop_params = True
113111

114112
def _setup_env(self, api_key: str, model: str | None) -> None:
115113
"""Set environment variables based on detected provider."""
116114
provider = self._provider_name
117-
if not provider and model:
118-
provider = detect_provider_by_model(model)
115+
if (not provider or provider == "litellm") and model:
116+
detected = detect_provider_by_model(model)
117+
if detected:
118+
provider = detected
119119

120120
if provider and provider in PROVIDER_CONFIGS:
121121
env_key = PROVIDER_CONFIGS[provider]["env_key"]
122122
os.environ[env_key] = api_key
123123
self._detected_provider = provider
124124
else:
125+
# Fallback to OpenAI if provider is unknown or literal litellm
125126
os.environ["OPENAI_API_KEY"] = api_key
126127

127128
def _resolve_model(self, model: str) -> str:
@@ -202,7 +203,14 @@ def _build_kwargs(self, model: str, messages: list) -> dict[str, Any]:
202203
if self.api_key:
203204
kwargs["api_key"] = self.api_key
204205
if self.api_base:
205-
kwargs["api_base"] = self.api_base
206+
# For Gemini, LiteLLM constructs the URL itself. If user provides a full Google endpoint
207+
# as api_base, it might break the URL construction in LiteLLM.
208+
# We only pass api_base if it doesn't look like a standard Google endpoint versioned URL.
209+
is_google_endpoint = "generativelanguage.googleapis.com" in self.api_base and (
210+
"/v1" in self.api_base or "/v1beta" in self.api_base
211+
)
212+
if not is_google_endpoint:
213+
kwargs["api_base"] = self.api_base
206214
if self._extra_headers:
207215
kwargs["extra_headers"] = self._extra_headers
208216

openviking/parse/parsers/directory.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ async def parse(
112112
viking_fs = self._get_viking_fs()
113113
temp_uri = self._create_temp_uri()
114114
target_uri = f"{temp_uri}/{dir_name}"
115-
await viking_fs.mkdir(temp_uri)
115+
await viking_fs.mkdir(temp_uri, exist_ok=True)
116+
await viking_fs.mkdir(target_uri, exist_ok=True)
116117
await viking_fs.mkdir(target_uri)
117118

118119
if not processable_files:

openviking/parse/parsers/markdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ async def _save_section(
529529

530530
# Create directory and handle children or split
531531
section_dir = f"{parent_dir}/{name}"
532-
await viking_fs.mkdir(section_dir)
532+
await viking_fs.mkdir(section_dir, exist_ok=True)
533533

534534
if has_children:
535535
await self._process_children(

openviking/parse/parsers/media/audio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
9191
ext_no_dot = ext[1:] if ext else ""
9292
root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}")
9393
root_dir_uri = f"{temp_uri}/{root_dir_name}"
94-
await viking_fs.mkdir(root_dir_uri)
94+
await viking_fs.mkdir(root_dir_uri, exist_ok=True)
9595

9696
# 1.1 Save original audio with original filename (sanitized)
9797
await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", audio_bytes)

openviking/parse/parsers/media/image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
106106
ext_no_dot = ext[1:] if ext else ""
107107
root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}")
108108
root_dir_uri = f"{temp_uri}/{root_dir_name}"
109-
await viking_fs.mkdir(root_dir_uri)
109+
await viking_fs.mkdir(root_dir_uri, exist_ok=True)
110110

111111
# 1.1 Save original image with original filename (sanitized)
112112
await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", image_bytes)

openviking/parse/parsers/media/video.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
9191
ext_no_dot = ext[1:] if ext else ""
9292
root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}")
9393
root_dir_uri = f"{temp_uri}/{root_dir_name}"
94-
await viking_fs.mkdir(root_dir_uri)
94+
await viking_fs.mkdir(root_dir_uri, exist_ok=True)
9595

9696
# 1.1 Save original video with original filename (sanitized)
9797
await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", video_bytes)

openviking/parse/tree_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ async def _ensure_parent_dirs(self, uri: str, ctx: RequestContext) -> None:
248248

249249
# Create parent directory (ignore if already exists)
250250
try:
251-
await viking_fs.mkdir(parent_uri, ctx=ctx)
251+
await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx)
252252
logger.debug(f"Created parent directory: {parent_uri}")
253253
except Exception as e:
254254
# Directory may already exist, ignore error

openviking/storage/queuefs/semantic_processor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,10 @@ async def _generate_text_summary(
318318

319319
# Read file content (limit length)
320320
content = await viking_fs.read_file(file_path, ctx=active_ctx)
321+
322+
# Limit content length (about 10000 tokens)
323+
max_chars = 30000
324+
content = await viking_fs.read_file(file_path, ctx=active_ctx)
321325
if isinstance(content, bytes):
322326
# Try to decode with error handling for text files
323327
try:

openviking/storage/viking_fs.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -958,8 +958,37 @@ def _handle_agfs_read(self, result: Union[bytes, Any, None]) -> bytes:
958958
except Exception:
959959
return b""
960960

961+
def _decode_bytes(self, data: bytes) -> str:
962+
"""Robustly decode bytes to string."""
963+
if not data:
964+
return ""
965+
try:
966+
return data.decode("utf-8")
967+
except UnicodeDecodeError:
968+
try:
969+
# Try common encoding for Windows/legacy files in China
970+
return data.decode("gbk")
971+
except UnicodeDecodeError:
972+
try:
973+
return data.decode("latin-1")
974+
except UnicodeDecodeError:
975+
return data.decode("utf-8", errors="replace")
976+
961977
def _handle_agfs_content(self, result: Union[bytes, Any, None]) -> str:
962978
"""Handle AGFSClient content return types consistently."""
979+
if isinstance(result, bytes):
980+
return self._decode_bytes(result)
981+
elif hasattr(result, "content") and result.content is not None:
982+
return self._decode_bytes(result.content)
983+
elif result is None:
984+
return ""
985+
else:
986+
# Try to convert to string
987+
try:
988+
return str(result)
989+
except Exception:
990+
return ""
991+
"""Handle AGFSClient content return types consistently."""
963992
if isinstance(result, bytes):
964993
return result.decode("utf-8")
965994
elif hasattr(result, "content"):
@@ -1282,7 +1311,7 @@ async def append_file(
12821311
existing = ""
12831312
try:
12841313
existing_bytes = self._handle_agfs_read(self.agfs.read(path))
1285-
existing = existing_bytes.decode("utf-8")
1314+
existing = self._decode_bytes(existing_bytes)
12861315
except Exception:
12871316
pass
12881317

0 commit comments

Comments
 (0)