Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions openviking/parse/parsers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,6 @@ def _is_code_repository_url(self, url: str) -> bool:
if re.match(pattern, url):
return True

# Check if it's a GitHub/GitLab URL
parsed = urlparse(url)
if parsed.netloc in ["github.com", "gitlab.com"]:
path_parts = parsed.path.strip("/").split("/")
if len(path_parts) >= 2:
return True

return False


Expand Down Expand Up @@ -359,14 +352,17 @@ async def _handle_download_link(
# Get appropriate parser
if file_type == "pdf":
from openviking.parse.parsers.pdf import PDFParser

parser = PDFParser()
result = await parser.parse(temp_path)
elif file_type == "markdown":
from openviking.parse.parsers.markdown import MarkdownParser

parser = MarkdownParser()
result = await parser.parse(temp_path)
elif file_type == "text":
from openviking.parse.parsers.text import TextParser

parser = TextParser()
result = await parser.parse(temp_path)
elif file_type == "html":
Expand Down Expand Up @@ -478,6 +474,22 @@ async def _fetch_html(self, url: str) -> str:
response.raise_for_status()
return response.text

def _convert_to_raw_url(self, url: str) -> str:
"""Convert GitHub/GitLab blob URL to raw URL."""
parsed = urlparse(url)

if parsed.netloc == "github.com":
path_parts = parsed.path.strip("/").split("/")
if len(path_parts) >= 4 and path_parts[2] == "blob":
# Remove 'blob'
new_path = "/".join(path_parts[:2] + path_parts[3:])
return f"https://raw.githubusercontent.com/{new_path}"

if parsed.netloc == "gitlab.com" and "/blob/" in parsed.path:
return url.replace("/blob/", "/raw/")

return url

async def _download_file(self, url: str) -> str:
"""
Download file from URL to temporary location.
Expand All @@ -493,6 +505,8 @@ async def _download_file(self, url: str) -> str:
"""
httpx = lazy_import("httpx")

url = self._convert_to_raw_url(url)

# Determine file extension from URL
parsed = urlparse(url)
ext = Path(parsed.path).suffix or ".tmp"
Expand Down
53 changes: 53 additions & 0 deletions tests/parse/test_html_parser_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pytest
from openviking.parse.parsers.html import HTMLParser


class TestHTMLParserRawUrlConversion:
    """Checks for HTMLParser._convert_to_raw_url.

    Blob-page URLs on github.com / gitlab.com must map to their raw-content
    counterparts; every other URL must come back untouched.
    """

    def setup_method(self):
        # A fresh parser per test keeps the cases independent.
        self.parser = HTMLParser()

    def test_github_blob_conversion(self):
        # (blob URL, expected raw URL); the second has a deeper file path.
        cases = [
            (
                "https://github.com/volcengine/OpenViking/blob/main/docs/design.md",
                "https://raw.githubusercontent.com/volcengine/OpenViking/main/docs/design.md",
            ),
            (
                "https://github.com/user/repo/blob/feature/branch/src/components/Button.tsx",
                "https://raw.githubusercontent.com/user/repo/feature/branch/src/components/Button.tsx",
            ),
        ]
        for source, wanted in cases:
            assert self.parser._convert_to_raw_url(source) == wanted

    def test_github_non_blob_urls(self):
        # Repository root, an issue page, and an already-raw URL pass through.
        passthrough = (
            "https://github.com/volcengine/OpenViking",
            "https://github.com/volcengine/OpenViking/issues/1",
            "https://raw.githubusercontent.com/volcengine/OpenViking/main/README.md",
        )
        for candidate in passthrough:
            assert self.parser._convert_to_raw_url(candidate) == candidate

    def test_gitlab_blob_conversion(self):
        # (blob URL, expected raw URL) for the "/-/blob/" route form.
        cases = [
            (
                "https://gitlab.com/gitlab-org/gitlab/-/blob/master/README.md",
                "https://gitlab.com/gitlab-org/gitlab/-/raw/master/README.md",
            ),
            (
                "https://gitlab.com/group/project/-/blob/dev/src/main.rs",
                "https://gitlab.com/group/project/-/raw/dev/src/main.rs",
            ),
        ]
        for source, wanted in cases:
            assert self.parser._convert_to_raw_url(source) == wanted

    def test_gitlab_non_blob_urls(self):
        # Project root and an issue page contain no blob segment.
        passthrough = (
            "https://gitlab.com/gitlab-org/gitlab",
            "https://gitlab.com/gitlab-org/gitlab/-/issues/123",
        )
        for candidate in passthrough:
            assert self.parser._convert_to_raw_url(candidate) == candidate

    def test_other_domains(self):
        # Hosts other than github.com / gitlab.com are never rewritten,
        # even when the path happens to contain "/blob/".
        passthrough = (
            "https://example.com/blob/main/file.txt",
            "https://bitbucket.org/user/repo/src/master/README.md",
        )
        for candidate in passthrough:
            assert self.parser._convert_to_raw_url(candidate) == candidate
Loading