Skip to content

Commit dc5da23

Browse files
authored
feat: Add C# AST extractor support (#366)
- Add CSharpExtractor class with support for:
  - Classes, interfaces, structs, records
  - Methods, constructors, properties
  - XML documentation comments (/// and /** */)
  - Namespaces and file-scoped namespaces
  - using directives
- Register .cs file extension and csharp extractor
- Add tree-sitter-c-sharp dependency
- Add comprehensive tests aligned with other language extractors
1 parent 4a56147 commit dc5da23

File tree

4 files changed

+327
-0
lines changed

4 files changed

+327
-0
lines changed

openviking/parse/parsers/code/ast/extractor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
".hpp": "cpp",
2828
".rs": "rust",
2929
".go": "go",
30+
".cs": "csharp",
3031
}
3132

3233
# Language key → (module path, class name, constructor kwargs)
@@ -38,6 +39,7 @@
3839
"cpp": ("openviking.parse.parsers.code.ast.languages.cpp", "CppExtractor", {}),
3940
"rust": ("openviking.parse.parsers.code.ast.languages.rust", "RustExtractor", {}),
4041
"go": ("openviking.parse.parsers.code.ast.languages.go", "GoExtractor", {}),
42+
"csharp": ("openviking.parse.parsers.code.ast.languages.csharp", "CSharpExtractor", {}),
4143
}
4244

4345

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
2+
# SPDX-License-Identifier: Apache-2.0
3+
"""C# AST extractor using tree-sitter-c-sharp."""
4+
5+
import re
6+
from typing import List
7+
8+
from openviking.parse.parsers.code.ast.languages.base import LanguageExtractor
9+
from openviking.parse.parsers.code.ast.skeleton import ClassSkeleton, CodeSkeleton, FunctionSig
10+
11+
12+
def _node_text(node, content_bytes: bytes) -> str:
13+
return content_bytes[node.start_byte : node.end_byte].decode("utf-8", errors="replace")
14+
15+
16+
def _parse_doc_comment(raw: str) -> str:
17+
"""Strip XML doc comment markers (/// or /** */) and extract text from XML tags."""
18+
raw = raw.strip()
19+
if raw.startswith("///"):
20+
lines = raw.split("\n")
21+
cleaned = []
22+
for line in lines:
23+
stripped = line.strip()
24+
if stripped.startswith("///"):
25+
stripped = stripped[3:].strip()
26+
if stripped:
27+
cleaned.append(stripped)
28+
raw = " ".join(cleaned)
29+
elif raw.startswith("/**"):
30+
raw = raw[3:]
31+
if raw.endswith("*/"):
32+
raw = raw[:-2]
33+
lines = [l.strip().lstrip("*").strip() for l in raw.split("\n")]
34+
raw = "\n".join(l for l in lines if l).strip()
35+
# Remove XML tags
36+
raw = re.sub(r"</?[a-zA-Z][a-zA-Z0-9]*(?:\s+[^>]*)?/?>", "", raw)
37+
# Normalize whitespace
38+
raw = re.sub(r"\s+", " ", raw).strip()
39+
return raw
40+
41+
42+
def _preceding_doc(siblings: list, idx: int, content_bytes: bytes) -> str:
    """Collect the doc comments sitting directly above ``siblings[idx]``.

    Walks backwards from the declaration. ``comment`` nodes whose text starts
    with ``///`` or ``/**`` are cleaned with ``_parse_doc_comment`` and kept;
    ``preprocessor_directive`` and ``nullable_directive`` nodes are stepped
    over; any other node, including a non-doc comment, ends the walk.

    Returns:
        The cleaned comments in source order joined by newlines, or ``""``
        when there are none.
    """
    doc_markers = ("///", "/**")
    skippable = ("preprocessor_directive", "nullable_directive")

    collected = []  # gathered nearest-first, flipped before joining
    for pos in reversed(range(idx)):
        earlier = siblings[pos]
        if earlier.type in skippable:
            continue
        if earlier.type != "comment":
            break
        text = _node_text(earlier, content_bytes)
        if not text.strip().startswith(doc_markers):
            break
        collected.append(_parse_doc_comment(text))

    collected.reverse()
    return "\n".join(collected)
60+
61+
62+
def _extract_method(node, content_bytes: bytes, docstring: str = "") -> FunctionSig:
    """Build a ``FunctionSig`` for a method, constructor or property node.

    The member name and its declared type are read through the node's named
    fields first. Taking the first ``identifier`` child as the name is kept
    only as a fallback, because a member whose type is itself a plain
    identifier (``Foo Bar()``) would otherwise report the type as its name.

    Args:
        node: tree-sitter node of the member declaration.
        content_bytes: UTF-8 encoded source the node was parsed from.
        docstring: Already-cleaned doc comment to attach to the signature.

    Returns:
        The extracted signature. For a property with an accessor list,
        ``params`` holds the accessors (e.g. ``"{ get set }"``) instead of a
        parameter list.
    """
    name = ""
    params = ""
    return_type = ""

    # NOTE(review): the field names below ("name", "returns", "type") are what
    # tree-sitter-c-sharp is expected to expose — confirm against the pinned
    # grammar version. Unknown fields yield None and the scan below takes over.
    name_node = node.child_by_field_name("name")
    if name_node is not None:
        name = _node_text(name_node, content_bytes)

    for field_name in ("returns", "type"):
        type_node = node.child_by_field_name(field_name)
        if type_node is not None:
            return_type = _node_text(type_node, content_bytes)
            break

    # Positional scan: fills whatever the field lookups did not provide and
    # picks up the parameter list.
    for child in node.children:
        if child.type == "identifier":
            if not name:
                name = _node_text(child, content_bytes)
        elif child.type == "void_keyword":
            if not return_type:
                return_type = "void"
        elif child.type in ("predefined_type", "type_identifier", "generic_name"):
            if not return_type:
                return_type = _node_text(child, content_bytes)
        elif child.type == "parameter_list":
            raw = _node_text(child, content_bytes).strip()
            if raw.startswith("(") and raw.endswith(")"):
                raw = raw[1:-1]
            params = raw.strip()

    if node.type == "property_declaration":
        # Properties have no parameters; show their accessors instead.
        known_accessors = ("get", "set", "init")
        for child in node.children:
            if child.type != "accessor_list":
                continue
            accessors = []
            for acc in child.children:
                if acc.type != "accessor_declaration":
                    continue
                accessor_name = ""
                acc_name_node = acc.child_by_field_name("name")
                if acc_name_node is not None:
                    accessor_name = _node_text(acc_name_node, content_bytes).strip()
                else:
                    # No "name" field: fall back to the keyword token itself.
                    for sub in acc.children:
                        if sub.type in known_accessors:
                            accessor_name = sub.type
                            break
                if accessor_name in known_accessors:
                    accessors.append(accessor_name)
            if accessors:
                params = f"{{ {' '.join(accessors)} }}"

    return FunctionSig(name=name, params=params, return_type=return_type, docstring=docstring)
102+
103+
104+
def _extract_class(node, content_bytes: bytes, docstring: str = "") -> ClassSkeleton:
    """Build a ``ClassSkeleton`` for a class, interface, struct or record node.

    The first ``identifier`` child is used as the type name, every supported
    entry of the ``base_list`` becomes a base, and each method, constructor
    and property found in the ``declaration_list`` is converted with
    ``_extract_method`` together with its preceding doc comment.

    Args:
        node: tree-sitter node of the type declaration.
        content_bytes: UTF-8 encoded source the node was parsed from.
        docstring: Already-cleaned doc comment of the type itself.

    Returns:
        The skeleton; ``methods`` is empty when the declaration has no body.
    """
    # Generic (``IList<T>``) and qualified (``System.IDisposable``) bases are
    # separate node types and must be listed explicitly to be kept.
    base_types = ("type_identifier", "identifier", "generic_name", "qualified_name")
    member_types = ("method_declaration", "constructor_declaration", "property_declaration")

    name = ""
    bases: List[str] = []
    body_node = None

    for child in node.children:
        if child.type == "identifier" and not name:
            name = _node_text(child, content_bytes)
        elif child.type == "base_list":
            bases.extend(
                _node_text(sub, content_bytes)
                for sub in child.children
                if sub.type in base_types
            )
        elif child.type == "declaration_list":
            body_node = child

    methods: List[FunctionSig] = []
    if body_node is not None:
        siblings = list(body_node.children)
        for idx, member in enumerate(siblings):
            if member.type in member_types:
                doc = _preceding_doc(siblings, idx, content_bytes)
                methods.append(_extract_method(member, content_bytes, docstring=doc))

    return ClassSkeleton(name=name, bases=bases, docstring=docstring, methods=methods)
131+
132+
133+
class CSharpExtractor(LanguageExtractor):
    """Extract a ``CodeSkeleton`` from C# source using tree-sitter-c-sharp."""

    # Declarations reported as classes in the skeleton.
    _TYPE_DECLARATIONS = (
        "class_declaration",
        "interface_declaration",
        "struct_declaration",
        "record_declaration",
    )
    # Nodes that only group other declarations and are searched recursively.
    _CONTAINER_TYPES = (
        "namespace_declaration",
        "file_scoped_namespace_declaration",
        "declaration_list",
    )

    def __init__(self):
        # Imported lazily so the module can be imported without the grammar
        # package being installed.
        import tree_sitter_c_sharp as tscsharp
        from tree_sitter import Language, Parser

        self._language = Language(tscsharp.language())
        self._parser = Parser(self._language)

    def extract(self, file_name: str, content: str) -> CodeSkeleton:
        """Parse ``content`` and return its skeleton.

        Args:
            file_name: Name recorded in the skeleton.
            content: C# source text.

        Returns:
            A ``CodeSkeleton`` with the ``using`` targets as imports and every
            class, interface, struct and record found at file level or inside
            (possibly nested) namespaces. ``functions`` is always empty.
        """
        content_bytes = content.encode("utf-8")
        root = self._parser.parse(content_bytes).root_node

        imports: List[str] = []
        classes: List[ClassSkeleton] = []
        functions: List[FunctionSig] = []

        self._collect(list(root.children), content_bytes, imports, classes)

        return CodeSkeleton(
            file_name=file_name,
            language="C#",
            module_doc="",
            imports=imports,
            classes=classes,
            functions=functions,
        )

    def _collect(self, siblings: list, content_bytes: bytes, imports: list, classes: list) -> None:
        """Append the imports and type skeletons found under ``siblings``.

        Namespaces (block or file-scoped) and their declaration lists are
        descended into, so nested namespaces and ``using`` directives placed
        inside a namespace are picked up as well. Results are appended to
        ``imports`` and ``classes`` in document order.
        """
        for idx, child in enumerate(siblings):
            kind = child.type
            if kind == "using_directive":
                imports.extend(
                    _node_text(sub, content_bytes)
                    for sub in child.children
                    if sub.type in ("identifier", "qualified_name")
                )
            elif kind in self._CONTAINER_TYPES:
                # NOTE(review): members of a file-scoped namespace may appear
                # either as its children or as root-level siblings depending
                # on the grammar version — both layouts end up here.
                self._collect(list(child.children), content_bytes, imports, classes)
            elif kind in self._TYPE_DECLARATIONS:
                doc = _preceding_doc(siblings, idx, content_bytes)
                classes.append(_extract_class(child, content_bytes, docstring=doc))

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ dependencies = [
6666
"tree-sitter-cpp>=0.23.0",
6767
"tree-sitter-rust>=0.23.0",
6868
"tree-sitter-go>=0.23.0",
69+
"tree-sitter-c-sharp>=0.23.0",
6970
]
7071

7172
[tool.uv.sources]

tests/parse/test_ast_extractor.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ def _ts_extractor():
3030
return JsTsExtractor(lang="typescript")
3131

3232

33+
def _csharp_extractor():
    """Return a new ``CSharpExtractor``; the import is deferred to call time."""
    from openviking.parse.parsers.code.ast.languages import csharp

    return csharp.CSharpExtractor()
36+
37+
3338

3439
# ---------------------------------------------------------------------------
3540
# Python
@@ -452,6 +457,130 @@ def test_to_text_verbose(self):
452457
assert "@return sum of a and b" in text
453458

454459

460+
# ---------------------------------------------------------------------------
461+
# C#
462+
# ---------------------------------------------------------------------------
463+
464+
class TestCSharpExtractor:
    """Checks for the C# extractor: imports, classes, members and rendering."""

    SAMPLE = """
using System;
using System.Collections.Generic;

namespace MyApp.Services
{
/// <summary>
/// A simple calculator service.
///
/// Supports basic arithmetic operations.
/// </summary>
public class Calculator
{
/// <summary>
/// Add two integers.
///
/// <param name=\"a\">First operand</param>
/// <param name=\"b\">Second operand</param>
/// <returns>Sum of a and b</returns>
/// </summary>
public int Add(int a, int b)
{
return a + b;
}

/// <summary>
/// Subtract b from a.
/// </summary>
public int Subtract(int a, int b)
{
return a - b;
}
}
}
"""

    def setup_method(self):
        self.e = _csharp_extractor()

    def _skeleton(self, source):
        """Extract ``source`` under the file name used throughout these tests."""
        return self.e.extract("Calculator.cs", source)

    @staticmethod
    def _calculator(skeleton):
        """Return the ``Calculator`` class from ``skeleton``."""
        return next(c for c in skeleton.classes if c.name == "Calculator")

    def test_imports(self):
        found = self._skeleton(self.SAMPLE).imports
        assert "System" in found
        assert "System.Collections.Generic" in found

    def test_class_extracted(self):
        class_names = {c.name for c in self._skeleton(self.SAMPLE).classes}
        assert "Calculator" in class_names

    def test_class_docstring(self):
        calculator = self._calculator(self._skeleton(self.SAMPLE))
        assert "simple calculator service" in calculator.docstring
        assert "Supports basic arithmetic" in calculator.docstring

    def test_methods_extracted(self):
        calculator = self._calculator(self._skeleton(self.SAMPLE))
        by_name = {m.name: m for m in calculator.methods}
        assert "Add" in by_name
        assert "Subtract" in by_name

    def test_method_docstring(self):
        calculator = self._calculator(self._skeleton(self.SAMPLE))
        by_name = {m.name: m for m in calculator.methods}
        assert "Add two integers." in by_name["Add"].docstring
        assert "First operand" in by_name["Add"].docstring

    def test_to_text_compact(self):
        rendered = self._skeleton(self.SAMPLE).to_text(verbose=False)
        assert "# Calculator.cs [C#]" in rendered
        assert "class Calculator" in rendered
        assert "+ Add(" in rendered
        # Compact output must leave the doc comments out.
        assert "First operand" not in rendered

    def test_to_text_verbose(self):
        rendered = self._skeleton(self.SAMPLE).to_text(verbose=True)
        assert "simple calculator service" in rendered
        assert "First operand" in rendered

    def test_file_scoped_namespace(self):
        code = '''
using System;

namespace MyApp.Services;

public class Calculator
{
public int Add(int a, int b)
{
return a + b;
}
}
'''
        class_names = {c.name for c in self._skeleton(code).classes}
        assert "Calculator" in class_names

    def test_property_accessor_signature(self):
        code = '''
public class Calculator
{
/// <summary>
/// Current result.
/// </summary>
public int Result { get; set; }
}
'''
        calculator = self._calculator(self._skeleton(code))
        by_name = {m.name: m for m in calculator.methods}
        assert "Result" in by_name
        assert "get" in by_name["Result"].params
        assert "set" in by_name["Result"].params
582+
583+
455584
# ---------------------------------------------------------------------------
456585
# C/C++
457586
# ---------------------------------------------------------------------------
@@ -853,6 +982,12 @@ def test_go_dispatch(self):
853982
assert "# main.go [Go]" in text
854983
assert "Run" in text
855984

985+
def test_csharp_dispatch(self):
    """A ``.cs`` file name yields a skeleton labelled as C#."""
    source = (
        "namespace Demo;\n"
        "\n"
        "public class Util { public int Add(int a, int b) { return a + b; } }\n"
    )
    rendered = self.extractor.extract_skeleton("util.cs", source)
    for expected in ("# util.cs [C#]", "class Util"):
        assert expected in rendered
990+
856991
def test_unknown_extension_returns_none(self):
857992
code = "def foo(x): pass\nclass Bar: pass\n"
858993
result = self.extractor.extract_skeleton("script.lua", code)

0 commit comments

Comments
 (0)