Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions examples/ov.conf.example
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,12 @@
"max_line_length": 1000,
"max_token_limit": 50000,
"truncation_strategy": "head",
"warn_on_truncation": true
},
"warn_on_truncation": true,
"github_raw_domain": "raw.githubusercontent.com",
"code_hosting_domains": ["github.com", "gitlab.com"],
"github_domains": ["github.com", "www.github.com"],
"gitlab_domains": ["gitlab.com", "www.gitlab.com"]
},
"image": {
"enable_ocr": false,
"enable_vlm": true,
Expand Down Expand Up @@ -126,8 +130,11 @@
"extract_text_only": false,
"preserve_structure": true,
"clean_html": true,
"extract_metadata": true
},
"extract_metadata": true,
"code_hosting_domains": ["github.com", "gitlab.com"],
"github_domains": ["github.com", "www.github.com"],
"gitlab_domains": ["gitlab.com", "www.gitlab.com"]
},
"text": {
"detect_language": true,
"split_by_paragraphs": true,
Expand Down
29 changes: 25 additions & 4 deletions openviking/parse/parsers/code/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
IGNORE_EXTENSIONS,
)
from openviking.parse.parsers.upload_utils import upload_directory
from openviking.utils import is_github_url, parse_code_hosting_url
from openviking_cli.utils.config import get_openviking_config
from openviking_cli.utils.logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -132,8 +134,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
# 3. Create VikingFS temp URI
viking_fs = self._get_viking_fs()
temp_viking_uri = self._create_temp_uri()
# The structure in temp should be: viking://temp/{uuid}/{repo_name}/...
target_root_uri = f"{temp_viking_uri}/{repo_name}"
# The structure in temp should be: viking://temp/{uuid}/repository/...
# Use simple name 'repository' for temp, TreeBuilder will rename it to org/repo later
target_root_uri = f"{temp_viking_uri}/repository"

logger.info(f"Uploading to VikingFS: {target_root_uri}")

Expand Down Expand Up @@ -243,18 +246,36 @@ def _normalize_repo_url(self, url: str) -> str:
git_index = next((i for i, p in enumerate(path_parts) if p.endswith(".git")), None)
if git_index is not None:
base_parts = path_parts[: git_index + 1]
elif parsed.netloc in ["github.com", "gitlab.com"] and len(path_parts) >= 2:

config = get_openviking_config()
if (
parsed.netloc in config.code.github_domains + config.code.gitlab_domains
and len(path_parts) >= 2
):
base_parts = path_parts[:2]
base_path = "/" + "/".join(base_parts)
return parsed._replace(path=base_path, query="", fragment="").geturl()
return url

def _get_repo_name(self, url: str) -> str:
"""Get repository name with organization for GitHub/GitLab URLs.

For https://github.com/volcengine/OpenViking, returns "volcengine/OpenViking"
For other URLs, falls back to just the repo name.
"""
# First try to parse as code hosting URL
parsed_org_repo = parse_code_hosting_url(url)
if parsed_org_repo:
return parsed_org_repo

# Fallback for other URLs
name_source = url
if url.startswith(("http://", "https://", "git://", "ssh://")):
name_source = urlparse(url).path.rstrip("/")
elif ":" in url and not url.startswith("file://"):
name_source = url.split(":", 1)[1]

# Original logic for non-GitHub/GitLab URLs
name = name_source.rstrip("/").split("/")[-1]
if name.endswith(".git"):
name = name[:-4]
Expand Down Expand Up @@ -284,7 +305,7 @@ async def _has_commit(self, repo_dir: str, commit: str) -> bool:
@staticmethod
def _is_github_url(url: str) -> bool:
    """Return True for GitHub URLs (supports ZIP archive API).

    Delegates to the shared ``is_github_url`` helper imported from
    ``openviking.utils`` so the domain check lives in one place instead of
    a hard-coded tuple that would shadow it.

    Args:
        url: Repository URL to classify.

    Returns:
        Whatever ``is_github_url(url)`` reports for this URL.
    """
    return is_github_url(url)

async def _github_zip_download(
self,
Expand Down
39 changes: 29 additions & 10 deletions openviking/parse/parsers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
lazy_import,
)
from openviking.parse.parsers.base_parser import BaseParser
from openviking_cli.utils.config import get_openviking_config


class URLType(Enum):
Expand Down Expand Up @@ -141,13 +142,27 @@ def _is_code_repository_url(self, url: str) -> bool:
"""
import re

# Repository URL patterns (from README)
repo_patterns = [
r"^https?://github\.com/[^/]+/[^/]+/?$",
r"^https?://gitlab\.com/[^/]+/[^/]+/?$",
r"^.*\.git$",
r"^git@",
]
config = get_openviking_config()
github_domains = list(set(config.html.github_domains + config.code.github_domains))
gitlab_domains = list(set(config.html.gitlab_domains + config.code.gitlab_domains))
# Build repository URL patterns from config
repo_patterns = []

# Add patterns for GitHub domains
for domain in github_domains:
repo_patterns.append(rf"^https?://{re.escape(domain)}/[^/]+/[^/]+/?$")

# Add patterns for GitLab domains
for domain in gitlab_domains:
repo_patterns.append(rf"^https?://{re.escape(domain)}/[^/]+/[^/]+/?$")

# Add other patterns
repo_patterns.extend(
[
r"^.*\.git$",
r"^git@",
]
)

# Check for URL patterns
for pattern in repo_patterns:
Expand Down Expand Up @@ -478,15 +493,19 @@ async def _fetch_html(self, url: str) -> str:
def _convert_to_raw_url(self, url: str) -> str:
    """Convert a GitHub/GitLab blob URL to its raw-content URL.

    GitHub: ``https://<github domain>/<org>/<repo>/blob/<rest>`` becomes
    ``https://<github_raw_domain>/<org>/<repo>/<rest>``.
    GitLab: the first ``/blob/`` in the URL is swapped for ``/raw/``.

    Args:
        url: URL to convert.

    Returns:
        The raw-content URL, or ``url`` unchanged when it is not a
        recognized blob URL on a configured domain.
    """
    parsed = urlparse(url)
    config = get_openviking_config()
    github_domains = config.html.github_domains
    gitlab_domains = config.html.gitlab_domains
    github_raw_domain = config.code.github_raw_domain

    if parsed.netloc in github_domains:
        path_parts = parsed.path.strip("/").split("/")
        if len(path_parts) >= 4 and path_parts[2] == "blob":
            # Drop the 'blob' segment: org/repo/blob/ref/... -> org/repo/ref/...
            new_path = "/".join(path_parts[:2] + path_parts[3:])
            return f"https://{github_raw_domain}/{new_path}"

    if parsed.netloc in gitlab_domains and "/blob/" in parsed.path:
        # Replace only the first occurrence. Because "/blob/" is known to be
        # in the path, the first match in the full URL is the route marker;
        # later "/blob/" segments belong to the file path (or query) and must
        # be left intact.
        return url.replace("/blob/", "/raw/", 1)

    return url
Expand Down
34 changes: 31 additions & 3 deletions openviking/parse/tree_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from openviking.parse.parsers.media.utils import get_media_base_uri, get_media_type
from openviking.storage.queuefs import SemanticMsg, get_queue_manager
from openviking.storage.viking_fs import get_viking_fs
from openviking.utils import parse_code_hosting_url
from openviking_cli.utils.uri import VikingURI

if TYPE_CHECKING:
Expand Down Expand Up @@ -127,12 +128,39 @@ async def finalize_from_temp(
doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"])
temp_doc_uri = f"{temp_uri}/{doc_name}"

# 2. Determine base_uri
# 2. Determine base_uri and final document name with org/repo for GitHub/GitLab
if base_uri is None:
base_uri = self._get_base_uri(scope, source_path, source_format)

# 3. Build final URI, auto-renaming on conflict (e.g. doc_1, doc_2, ...)
candidate_uri = VikingURI(base_uri).join(doc_name).uri
# Check if source_path is a GitHub/GitLab URL and extract org/repo
final_doc_name = doc_name
if source_path and source_format == "repository":
parsed_org_repo = parse_code_hosting_url(source_path)
if parsed_org_repo:
final_doc_name = parsed_org_repo

# 3. Check if base_uri exists - if it does, use it as parent directory
try:
await viking_fs.stat(base_uri)
base_exists = True
except Exception:
base_exists = False

if base_exists:
if "/" in final_doc_name:
repo_name_only = final_doc_name.split("/")[-1]
else:
repo_name_only = final_doc_name
candidate_uri = VikingURI(base_uri).join(repo_name_only).uri
else:
if "/" in final_doc_name:
parts = final_doc_name.split("/")
sanitized_parts = [VikingURI.sanitize_segment(p) for p in parts if p]
base_viking_uri = VikingURI(base_uri)
candidate_uri = VikingURI.build(base_viking_uri.scope, *sanitized_parts)
else:
candidate_uri = VikingURI(base_uri).join(doc_name).uri

final_uri = await self._resolve_unique_uri(candidate_uri)
if final_uri != candidate_uri:
logger.info(f"[TreeBuilder] Resolved name conflict: {candidate_uri} -> {final_uri}")
Expand Down
10 changes: 10 additions & 0 deletions openviking/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
# SPDX-License-Identifier: Apache-2.0
"""Utility functions and helpers."""

from openviking.utils.code_hosting_utils import (
is_code_hosting_url,
is_github_url,
is_gitlab_url,
parse_code_hosting_url,
)
from openviking.utils.time_utils import get_current_timestamp
from openviking_cli.utils.async_utils import run_async
from openviking_cli.utils.llm import StructuredLLM, parse_json_from_response, parse_json_to_model
Expand All @@ -17,4 +23,8 @@
"parse_json_from_response",
"parse_json_to_model",
"run_async",
"parse_code_hosting_url",
"is_github_url",
"is_gitlab_url",
"is_code_hosting_url",
]
95 changes: 95 additions & 0 deletions openviking/utils/code_hosting_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: Apache-2.0
"""
Utilities for code hosting platform URL parsing.

This module provides shared functionality for parsing URLs from code hosting
platforms like GitHub and GitLab.
"""

from typing import Optional
from urllib.parse import urlparse

from openviking_cli.utils.config import get_openviking_config


def parse_code_hosting_url(url: str) -> Optional[str]:
    """Parse a code hosting platform URL to get its org/repo path.

    The URL's host is compared against the GitHub and GitLab domains from
    the OpenViking configuration. Both the raw network location and the
    bare hostname are checked, so URLs carrying credentials, a port, or a
    differently-cased host (e.g. ``ssh://git@github.com/org/repo.git``)
    are recognized as well.

    Args:
        url: Code hosting URL like https://github.com/volcengine/OpenViking

    Returns:
        org/repo path like "volcengine/OpenViking", or None if ``url`` is
        not an http(s)/git/ssh URL on a configured domain with at least
        two path segments. A trailing ".git" is dropped from the repo
        segment, and any character other than an alphanumeric, "-" or "_"
        in either segment is replaced with "_".
    """
    if not url.startswith(("http://", "https://", "git://", "ssh://")):
        return None

    parsed = urlparse(url)
    path_parts = [part for part in parsed.path.split("/") if part]
    # Without both an org and a repo segment there is nothing to return, so
    # bail out before loading the configuration.
    if len(path_parts) < 2:
        return None

    config = get_openviking_config()
    known_domains = set(config.code.github_domains) | set(config.code.gitlab_domains)

    # netloc keeps userinfo and port ("git@github.com", "github.com:443");
    # hostname is the bare, lower-cased host. Accept either so existing
    # config entries keep matching while ssh:// and ported URLs match too.
    hostname = parsed.hostname or ""
    if parsed.netloc not in known_domains and hostname not in known_domains:
        return None

    def _sanitize(segment: str) -> str:
        # Keep alphanumerics, "-" and "_"; replace everything else with "_".
        return "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in segment)

    # Take the first two parts: org/repo
    org, repo = path_parts[0], path_parts[1]
    if repo.endswith(".git"):
        repo = repo[:-4]
    return f"{_sanitize(org)}/{_sanitize(repo)}"


def is_github_url(url: str) -> bool:
    """Tell whether ``url`` is served from a configured GitHub domain.

    Args:
        url: URL to check

    Returns:
        True if the URL's network location is one of the GitHub domains
        listed in the OpenViking configuration
    """
    github_domains = get_openviking_config().code.github_domains
    netloc = urlparse(url).netloc
    return netloc in github_domains


def is_gitlab_url(url: str) -> bool:
    """Tell whether ``url`` is served from a configured GitLab domain.

    Args:
        url: URL to check

    Returns:
        True if the URL's network location is one of the GitLab domains
        listed in the OpenViking configuration
    """
    gitlab_domains = get_openviking_config().code.gitlab_domains
    netloc = urlparse(url).netloc
    return netloc in gitlab_domains


def is_code_hosting_url(url: str) -> bool:
    """Tell whether ``url`` is served from any configured code hosting domain.

    The candidate domains are the union of the GitHub, GitLab and generic
    code-hosting domain lists from the OpenViking configuration.

    Args:
        url: URL to check

    Returns:
        True if the URL's network location is one of those domains
    """
    config = get_openviking_config()
    combined = (
        config.code.github_domains
        + config.code.gitlab_domains
        + config.code.code_hosting_domains
    )
    # De-duplicate before parsing the URL, mirroring the original order of work.
    unique_domains = set(combined)
    netloc = urlparse(url).netloc
    return netloc in unique_domains
3 changes: 3 additions & 0 deletions openviking/utils/resource_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ async def process_resource(
result["status"] = "error"
result["errors"].append(f"Parse error: {e}")
logger.error(f"[ResourceProcessor] Parse error: {e}")
import traceback

traceback.print_exc()
return result

# parse_result contains:
Expand Down
Loading