Skip to content

Commit 454e727

Browse files
aeromomo and OpenClaw Integration authored
fix: improve binary content detection and null byte handling (#210)
- Add binary content detection based on null byte percentage (>5%)
- Add control character validation to avoid processing binary files as text
- Remove null bytes from decoded text content to prevent downstream issues
- Add logging for binary content detection and null byte removal

This prevents potential issues when processing files that contain null bytes or other binary data that could cause problems in text processing pipelines. The fix uses a 5% threshold for null bytes to distinguish between text files with occasional null bytes and truly binary content.

Co-authored-by: OpenClaw Integration <openclaw@example.com>
1 parent 3be6d0a commit 454e727

File tree

1 file changed

+19
-1
lines changed

1 file changed

+19
-1
lines changed

openviking/parse/parsers/upload_utils.py

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -62,10 +62,24 @@ def detect_and_convert_encoding(content: bytes, file_path: Union[str, Path] = ""
6262
return content
6363

6464
try:
65+
# Check for potential binary content (null bytes in first 8KB)
66+
# Binary files often contain null bytes which can cause issues
67+
sample_size = min(8192, len(content))
68+
if b'\x00' in content[:sample_size]:
69+
null_count = content[:sample_size].count(b'\x00')
70+
# If more than 5% null bytes in sample, likely binary - don't process
71+
if null_count / sample_size > 0.05:
72+
logger.debug(f"Detected binary content in {file_path} (null bytes: {null_count}), skipping encoding detection")
73+
return content
74+
6575
detected_encoding: Optional[str] = None
6676
for encoding in TEXT_ENCODINGS:
6777
try:
68-
content.decode(encoding)
78+
decoded = content.decode(encoding)
79+
# Additional validation: check for control characters that suggest binary
80+
control_chars = sum(1 for c in decoded[:1000] if ord(c) < 32 and c not in '\t\n\r')
81+
if control_chars / min(1000, len(decoded)) > 0.05: # More than 5% control chars
82+
continue
6983
detected_encoding = encoding
7084
break
7185
except UnicodeDecodeError:
@@ -77,6 +91,10 @@ def detect_and_convert_encoding(content: bytes, file_path: Union[str, Path] = ""
7791

7892
if detected_encoding not in UTF8_VARIANTS:
7993
decoded_content = content.decode(detected_encoding, errors="replace")
94+
# Remove null bytes from decoded content as they can cause issues downstream
95+
if '\x00' in decoded_content:
96+
decoded_content = decoded_content.replace('\x00', '')
97+
logger.debug(f"Removed null bytes from decoded content in {file_path}")
8098
content = decoded_content.encode("utf-8")
8199
logger.debug(f"Converted {file_path} from {detected_encoding} to UTF-8")
82100

0 commit comments

Comments
 (0)