From 9aa3d031feb0d84314e0d77789c7c5eb2f0dfdf7 Mon Sep 17 00:00:00 2001 From: Ivana Kellyerova Date: Tue, 9 Jan 2024 12:09:18 +0100 Subject: [PATCH 1/6] Fix UnicodeDecodeError on Python 2.7 --- sentry_sdk/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index d547e363b6..7c1f0707cc 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -1126,7 +1126,13 @@ def strip_string(value, max_length=None): if max_length is None: max_length = DEFAULT_MAX_VALUE_LENGTH - length = len(value.encode("utf-8")) + length = len(value) + if isinstance(value, text_type): + # we want the size in bytes rather than characters, if possible + try: + length = len(value.encode("utf-8")) + except UnicodeDecodeError: + pass if length > max_length: return AnnotatedValue( From 3ce6e1947120ebfddf5b307f8b407fdb82054d65 Mon Sep 17 00:00:00 2001 From: Ivana Kellyerova Date: Tue, 9 Jan 2024 12:24:17 +0100 Subject: [PATCH 2/6] test --- tests/utils/test_general.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py index 6f53de32c3..45886b12d6 100644 --- a/tests/utils/test_general.py +++ b/tests/utils/test_general.py @@ -591,3 +591,7 @@ def test_strip_string(): text_with_unicode_character = u"éê" assert strip_string(text_with_unicode_character, max_length=2).value == u"é..." # fmt: on + + # This was causing UnicodeDecodeErrors in Python 2 + text_with_unicode_character = "éê" + assert strip_string(text_with_unicode_character, max_length=2).value == "éê" From 5f7cc1189b5811407f5f0c07c4940efd1138d015 Mon Sep 17 00:00:00 2001 From: Ivana Kellyerova Date: Wed, 10 Jan 2024 15:21:54 +0100 Subject: [PATCH 3/6] wip --- sentry_sdk/utils.py | 57 +++++++++++++++++++++++--------- tests/utils/test_general.py | 66 ++++++++++++++++++++++++++----------- 2 files changed, 88 insertions(+), 35 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 7c1f0707cc..4954800232 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -1118,6 +1118,29 @@ def _is_in_project_root(abs_path, project_root): return False +def _truncate_string_by_bytes(string, max_bytes): + # type: (str, int) -> str + """ + Truncate a UTF-8 encodable string to the last full codepoint so that it fits in max_bytes. + """ + truncated = string.encode("utf-8")[: max_bytes - 3].decode("utf-8", errors="ignore") + return truncated + "..." + + +def _get_size_in_bytes(value): + # type: (Union[str, bytes]) -> Optional[int] + if not isinstance(value, (bytes, text_type)): + return None + + if isinstance(value, bytes): + return len(value) + + try: + return len(value.encode("utf-8")) + except UnicodeEncodeError: + return None + + def strip_string(value, max_length=None): # type: (str, Optional[int]) -> Union[AnnotatedValue, str] if not value: @@ -1126,23 +1149,25 @@ def strip_string(value, max_length=None): if max_length is None: max_length = DEFAULT_MAX_VALUE_LENGTH - length = len(value) - if isinstance(value, text_type): - # we want the size in bytes rather than characters, if possible - try: - length = len(value.encode("utf-8")) - except UnicodeDecodeError: - pass + bytes_size = _get_size_in_bytes(value) + text_size = len(value) - if length > max_length: - return AnnotatedValue( - value=value[: max_length - 3] + "...", - metadata={ - "len": length, - "rem": [["!limit", "x", max_length - 3, max_length]], - }, - ) - return value + if bytes_size and bytes_size > max_length: + # truncate to max_length bytes, preserving code points + truncated_value = _truncate_string_by_bytes(value, max_length) + elif text_size and text_size > max_length: + # fallback to truncating by string length + truncated_value = value[: max_length - 3] + "..." + else: + return value + + return AnnotatedValue( + value=truncated_value, + metadata={ + "len": bytes_size or text_size, + "rem": [["!limit", "x", max_length - 3, max_length]], + }, + ) def parse_version(version): diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py index 45886b12d6..f38a14db60 100644 --- a/tests/utils/test_general.py +++ b/tests/utils/test_general.py @@ -17,6 +17,8 @@ set_in_app_in_frames, strip_string, AnnotatedValue, + _get_size_in_bytes, + _truncate_string_by_bytes, ) from sentry_sdk._compat import text_type, string_types @@ -572,26 +574,52 @@ def test_failed_base64_conversion(input): assert to_base64(input) is None -def test_strip_string(): - # If value is None returns None. - assert strip_string(None) is None +@pytest.mark.parametrize( + "input,max_length,result", + [ + [None, None, None], + ["a" * 256, None, "a" * 256], + [ + "a" * 257, + 256, + AnnotatedValue( + value="a" * 253 + "...", + metadata={"len": 257, "rem": [["!limit", "x", 253, 256]]}, + ), + ], + # fmt: off + [u"éêéê", None, u"éêéê"], + [u"éêéê", 4, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]})], + # fmt: on + ["éêéê", None, "éêéê"], + [ + "éêéê", + 4, + AnnotatedValue( + value="é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]} + ), + ], + ], +) +def test_strip_string(input, max_length, result): + assert strip_string(input, max_length) == result - # If max_length is not passed, returns the full text (up to 1024 bytes). - text_1024_long = "a" * 1024 - assert strip_string(text_1024_long).count("a") == 1024 - # If value exceeds the max_length, returns an AnnotatedValue. - text_1025_long = "a" * 1025 - stripped_text = strip_string(text_1025_long) - assert isinstance(stripped_text, AnnotatedValue) - assert stripped_text.value.count("a") == 1021 # + '...' is 1024 +@pytest.mark.parametrize( + "input,max_bytes,result", + [ + [None, None], + ], +) +def test_truncate_by_bytes(input, max_bytes, result): + assert _truncate_string_by_bytes(input, max_bytes) == result - # If text has unicode characters, it counts bytes and not number of characters. - # fmt: off - text_with_unicode_character = u"éê" - assert strip_string(text_with_unicode_character, max_length=2).value == u"é..." - # fmt: on - # This was causing UnicodeDecodeErrors in Python 2 - text_with_unicode_character = "éê" - assert strip_string(text_with_unicode_character, max_length=2).value == "éê" +@pytest.mark.parametrize( + "input,result", + [ + ["abc", 3], + ], +) +def test_get_size_in_bytes(input, max_bytes, result): + assert _get_size_in_bytes(input, max_bytes) == result From 6a697d6190f852f4057a0bfbb8dda3fb9607470e Mon Sep 17 00:00:00 2001 From: Ivana Kellyerova Date: Mon, 15 Jan 2024 16:01:05 +0100 Subject: [PATCH 4/6] wip --- sentry_sdk/utils.py | 33 +++++++++++++++++++++++---------- tests/utils/test_general.py | 34 ++++++---------------------------- 2 files changed, 29 insertions(+), 38 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 4954800232..a77f72c3a4 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -382,6 +382,15 @@ def __init__(self, value, metadata): self.value = value self.metadata = metadata + def __eq__(self, other): + if not isinstance(other, AnnotatedValue): + return False + + return self.value == other.value and self.metadata == other.metadata + + def __repr__(self): + return self.value + @classmethod def removed_because_raw_data(cls): # type: () -> AnnotatedValue @@ -1118,10 +1127,10 @@ def _is_in_project_root(abs_path, project_root): return False -def _truncate_string_by_bytes(string, max_bytes): - # type: (str, int) -> str +def _truncate_by_bytes(string, max_bytes): + # type: (Union[str, bytes], int) -> str """ - Truncate a UTF-8 encodable string to the last full codepoint so that it fits in max_bytes. + Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes. """ truncated = string.encode("utf-8")[: max_bytes - 3].decode("utf-8", errors="ignore") return truncated + "..." @@ -1129,6 +1138,8 @@ def _truncate_string_by_bytes(string, max_bytes): def _get_size_in_bytes(value): # type: (Union[str, bytes]) -> Optional[int] + # XXX: broken 'unicodehere' py2 -- can't be encoded + # XXX remove repr if not isinstance(value, (bytes, text_type)): return None @@ -1137,7 +1148,7 @@ def _get_size_in_bytes(value): try: return len(value.encode("utf-8")) - except UnicodeEncodeError: + except (UnicodeEncodeError, UnicodeDecodeError): return None @@ -1149,13 +1160,15 @@ def strip_string(value, max_length=None): if max_length is None: max_length = DEFAULT_MAX_VALUE_LENGTH - bytes_size = _get_size_in_bytes(value) - text_size = len(value) + byte_size = _get_size_in_bytes(value) + text_size = None + if isinstance(value, text_type): + text_size = len(value) - if bytes_size and bytes_size > max_length: + if byte_size is not None and byte_size > max_length: # truncate to max_length bytes, preserving code points - truncated_value = _truncate_string_by_bytes(value, max_length) - elif text_size and text_size > max_length: + truncated_value = _truncate_by_bytes(value, max_length) + elif text_size is not None and text_size > max_length: # fallback to truncating by string length truncated_value = value[: max_length - 3] + "..." else: @@ -1164,7 +1177,7 @@ def strip_string(value, max_length=None): return AnnotatedValue( value=truncated_value, metadata={ - "len": bytes_size or text_size, + "len": byte_size or text_size, "rem": [["!limit", "x", max_length - 3, max_length]], }, ) diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py index f38a14db60..d4067bd5c6 100644 --- a/tests/utils/test_general.py +++ b/tests/utils/test_general.py @@ -17,8 +17,6 @@ set_in_app_in_frames, strip_string, AnnotatedValue, - _get_size_in_bytes, - _truncate_string_by_bytes, ) from sentry_sdk._compat import text_type, string_types @@ -588,38 +586,18 @@ def test_failed_base64_conversion(input): ), ], # fmt: off - [u"éêéê", None, u"éêéê"], - [u"éêéê", 4, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]})], + [u"éééé", None, u"éééé"], + [u"éééé", 5, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]})], # fmt: on - ["éêéê", None, "éêéê"], + ["éééé", None, "éééé"], [ - "éêéê", - 4, + "éééé", + 5, AnnotatedValue( - value="é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]} + value="é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]} ), ], ], ) def test_strip_string(input, max_length, result): assert strip_string(input, max_length) == result - - -@pytest.mark.parametrize( - "input,max_bytes,result", - [ - [None, None], - ], -) -def test_truncate_by_bytes(input, max_bytes, result): - assert _truncate_string_by_bytes(input, max_bytes) == result - - -@pytest.mark.parametrize( - "input,result", - [ - ["abc", 3], - ], -) -def test_get_size_in_bytes(input, max_bytes, result): - assert _get_size_in_bytes(input, max_bytes) == result From e999e9a0061ba615531d4ca7e50ba5940a9199fb Mon Sep 17 00:00:00 2001 From: Ivana Kellyerova Date: Thu, 18 Jan 2024 18:20:50 +0100 Subject: [PATCH 5/6] wip --- sentry_sdk/utils.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index a77f72c3a4..b860e524f0 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -388,9 +388,6 @@ def __eq__(self, other): return self.value == other.value and self.metadata == other.metadata - def __repr__(self): - return self.value - @classmethod def removed_because_raw_data(cls): # type: () -> AnnotatedValue @@ -1128,18 +1125,26 @@ def _is_in_project_root(abs_path, project_root): def _truncate_by_bytes(string, max_bytes): - # type: (Union[str, bytes], int) -> str + # type: (str, int) -> str """ Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes. """ - truncated = string.encode("utf-8")[: max_bytes - 3].decode("utf-8", errors="ignore") + # This function technically supports bytes, but only for Python 2 compat. + # XXX remove support for bytes when we drop Python 2 + if isinstance(string, bytes): + truncated = string[: max_bytes - 3] + else: + truncated = string.encode("utf-8")[: max_bytes - 3].decode( + "utf-8", errors="ignore" + ) + return truncated + "..." def _get_size_in_bytes(value): - # type: (Union[str, bytes]) -> Optional[int] - # XXX: broken 'unicodehere' py2 -- can't be encoded - # XXX remove repr + # type: (str) -> Optional[int] + # This function technically supports bytes, but only for Python 2 compat. + # XXX remove support for bytes when we drop Python 2 if not isinstance(value, (bytes, text_type)): return None From 17fb6d4c35c062712fdc96423665a2f543a5a403 Mon Sep 17 00:00:00 2001 From: Ivana Kellyerova Date: Thu, 18 Jan 2024 18:27:40 +0100 Subject: [PATCH 6/6] add missing type annotation --- sentry_sdk/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index b860e524f0..f4d3c535ad 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -383,6 +383,7 @@ def __init__(self, value, metadata): self.metadata = metadata def __eq__(self, other): + # type: (Any) -> bool if not isinstance(other, AnnotatedValue): return False