From 9aa3d031feb0d84314e0d77789c7c5eb2f0dfdf7 Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova <ivana.kellyerova@sentry.io>
Date: Tue, 9 Jan 2024 12:09:18 +0100
Subject: [PATCH 1/6] Fix UnicodeDecodeError on Python 2.7

---
 sentry_sdk/utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
index d547e363b6..7c1f0707cc 100644
--- a/sentry_sdk/utils.py
+++ b/sentry_sdk/utils.py
@@ -1126,7 +1126,13 @@ def strip_string(value, max_length=None):
     if max_length is None:
         max_length = DEFAULT_MAX_VALUE_LENGTH
 
-    length = len(value.encode("utf-8"))
+    length = len(value)
+    if isinstance(value, text_type):
+        # we want the size in bytes rather than characters, if possible
+        try:
+            length = len(value.encode("utf-8"))
+        except UnicodeDecodeError:
+            pass
 
     if length > max_length:
         return AnnotatedValue(

From 3ce6e1947120ebfddf5b307f8b407fdb82054d65 Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova <ivana.kellyerova@sentry.io>
Date: Tue, 9 Jan 2024 12:24:17 +0100
Subject: [PATCH 2/6] Add strip_string regression test for non-ASCII str on Python 2

---
 tests/utils/test_general.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py
index 6f53de32c3..45886b12d6 100644
--- a/tests/utils/test_general.py
+++ b/tests/utils/test_general.py
@@ -591,3 +591,7 @@ def test_strip_string():
     text_with_unicode_character = u"éê"
     assert strip_string(text_with_unicode_character, max_length=2).value == u"é..."
     # fmt: on
+
+    # This was causing UnicodeDecodeErrors in Python 2
+    text_with_unicode_character = "éê"
+    assert strip_string(text_with_unicode_character, max_length=2).value == "éê"

From 5f7cc1189b5811407f5f0c07c4940efd1138d015 Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova <ivana.kellyerova@sentry.io>
Date: Wed, 10 Jan 2024 15:21:54 +0100
Subject: [PATCH 3/6] Truncate strip_string by bytes via helpers; parametrize tests

---
 sentry_sdk/utils.py         | 57 +++++++++++++++++++++++---------
 tests/utils/test_general.py | 66 ++++++++++++++++++++++++++-----------
 2 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
index 7c1f0707cc..4954800232 100644
--- a/sentry_sdk/utils.py
+++ b/sentry_sdk/utils.py
@@ -1118,6 +1118,29 @@ def _is_in_project_root(abs_path, project_root):
     return False
 
 
+def _truncate_string_by_bytes(string, max_bytes):
+    # type: (str, int) -> str
+    """
+    Truncate a UTF-8 encodable string to the last full codepoint so that it fits in max_bytes.
+    """
+    truncated = string.encode("utf-8")[: max_bytes - 3].decode("utf-8", errors="ignore")
+    return truncated + "..."
+
+
+def _get_size_in_bytes(value):
+    # type: (Union[str, bytes]) -> Optional[int]
+    if not isinstance(value, (bytes, text_type)):
+        return None
+
+    if isinstance(value, bytes):
+        return len(value)
+
+    try:
+        return len(value.encode("utf-8"))
+    except UnicodeEncodeError:
+        return None
+
+
 def strip_string(value, max_length=None):
     # type: (str, Optional[int]) -> Union[AnnotatedValue, str]
     if not value:
@@ -1126,23 +1149,25 @@ def strip_string(value, max_length=None):
     if max_length is None:
         max_length = DEFAULT_MAX_VALUE_LENGTH
 
-    length = len(value)
-    if isinstance(value, text_type):
-        # we want the size in bytes rather than characters, if possible
-        try:
-            length = len(value.encode("utf-8"))
-        except UnicodeDecodeError:
-            pass
+    bytes_size = _get_size_in_bytes(value)
+    text_size = len(value)
 
-    if length > max_length:
-        return AnnotatedValue(
-            value=value[: max_length - 3] + "...",
-            metadata={
-                "len": length,
-                "rem": [["!limit", "x", max_length - 3, max_length]],
-            },
-        )
-    return value
+    if bytes_size and bytes_size > max_length:
+        # truncate to max_length bytes, preserving code points
+        truncated_value = _truncate_string_by_bytes(value, max_length)
+    elif text_size and text_size > max_length:
+        # fallback to truncating by string length
+        truncated_value = value[: max_length - 3] + "..."
+    else:
+        return value
+
+    return AnnotatedValue(
+        value=truncated_value,
+        metadata={
+            "len": bytes_size or text_size,
+            "rem": [["!limit", "x", max_length - 3, max_length]],
+        },
+    )
 
 
 def parse_version(version):
diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py
index 45886b12d6..f38a14db60 100644
--- a/tests/utils/test_general.py
+++ b/tests/utils/test_general.py
@@ -17,6 +17,8 @@
     set_in_app_in_frames,
     strip_string,
     AnnotatedValue,
+    _get_size_in_bytes,
+    _truncate_string_by_bytes,
 )
 from sentry_sdk._compat import text_type, string_types
 
@@ -572,26 +574,52 @@ def test_failed_base64_conversion(input):
         assert to_base64(input) is None
 
 
-def test_strip_string():
-    # If value is None returns None.
-    assert strip_string(None) is None
+@pytest.mark.parametrize(
+    "input,max_length,result",
+    [
+        [None, None, None],
+        ["a" * 256, None, "a" * 256],
+        [
+            "a" * 257,
+            256,
+            AnnotatedValue(
+                value="a" * 253 + "...",
+                metadata={"len": 257, "rem": [["!limit", "x", 253, 256]]},
+            ),
+        ],
+        # fmt: off
+        [u"éêéê", None, u"éêéê"],
+        [u"éêéê", 4, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]})],
+        # fmt: on
+        ["éêéê", None, "éêéê"],
+        [
+            "éêéê",
+            4,
+            AnnotatedValue(
+                value="é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]}
+            ),
+        ],
+    ],
+)
+def test_strip_string(input, max_length, result):
+    assert strip_string(input, max_length) == result
 
-    # If max_length is not passed, returns the full text (up to 1024 bytes).
-    text_1024_long = "a" * 1024
-    assert strip_string(text_1024_long).count("a") == 1024
 
-    # If value exceeds the max_length, returns an AnnotatedValue.
-    text_1025_long = "a" * 1025
-    stripped_text = strip_string(text_1025_long)
-    assert isinstance(stripped_text, AnnotatedValue)
-    assert stripped_text.value.count("a") == 1021  # + '...' is 1024
+@pytest.mark.parametrize(
+    "input,max_bytes,result",
+    [
+        [None, None],
+    ],
+)
+def test_truncate_by_bytes(input, max_bytes, result):
+    assert _truncate_string_by_bytes(input, max_bytes) == result
 
-    # If text has unicode characters, it counts bytes and not number of characters.
-    # fmt: off
-    text_with_unicode_character = u"éê"
-    assert strip_string(text_with_unicode_character, max_length=2).value == u"é..."
-    # fmt: on
 
-    # This was causing UnicodeDecodeErrors in Python 2
-    text_with_unicode_character = "éê"
-    assert strip_string(text_with_unicode_character, max_length=2).value == "éê"
+@pytest.mark.parametrize(
+    "input,result",
+    [
+        ["abc", 3],
+    ],
+)
+def test_get_size_in_bytes(input, max_bytes, result):
+    assert _get_size_in_bytes(input, max_bytes) == result

From 6a697d6190f852f4057a0bfbb8dda3fb9607470e Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova <ivana.kellyerova@sentry.io>
Date: Mon, 15 Jan 2024 16:01:05 +0100
Subject: [PATCH 4/6] Add AnnotatedValue.__eq__, rename _truncate_by_bytes, drop helper tests

---
 sentry_sdk/utils.py         | 33 +++++++++++++++++++++++----------
 tests/utils/test_general.py | 34 ++++++----------------------------
 2 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
index 4954800232..a77f72c3a4 100644
--- a/sentry_sdk/utils.py
+++ b/sentry_sdk/utils.py
@@ -382,6 +382,15 @@ def __init__(self, value, metadata):
         self.value = value
         self.metadata = metadata
 
+    def __eq__(self, other):
+        if not isinstance(other, AnnotatedValue):
+            return False
+
+        return self.value == other.value and self.metadata == other.metadata
+
+    def __repr__(self):
+        return self.value
+
     @classmethod
     def removed_because_raw_data(cls):
         # type: () -> AnnotatedValue
@@ -1118,10 +1127,10 @@ def _is_in_project_root(abs_path, project_root):
     return False
 
 
-def _truncate_string_by_bytes(string, max_bytes):
-    # type: (str, int) -> str
+def _truncate_by_bytes(string, max_bytes):
+    # type: (Union[str, bytes], int) -> str
     """
-    Truncate a UTF-8 encodable string to the last full codepoint so that it fits in max_bytes.
+    Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.
     """
     truncated = string.encode("utf-8")[: max_bytes - 3].decode("utf-8", errors="ignore")
     return truncated + "..."
@@ -1129,6 +1138,8 @@ def _truncate_string_by_bytes(string, max_bytes):
 
 def _get_size_in_bytes(value):
     # type: (Union[str, bytes]) -> Optional[int]
+    # XXX: broken 'unicodehere' py2 -- can't be encoded
+    # XXX remove repr
     if not isinstance(value, (bytes, text_type)):
         return None
 
@@ -1137,7 +1148,7 @@ def _get_size_in_bytes(value):
 
     try:
         return len(value.encode("utf-8"))
-    except UnicodeEncodeError:
+    except (UnicodeEncodeError, UnicodeDecodeError):
         return None
 
 
@@ -1149,13 +1160,15 @@ def strip_string(value, max_length=None):
     if max_length is None:
         max_length = DEFAULT_MAX_VALUE_LENGTH
 
-    bytes_size = _get_size_in_bytes(value)
-    text_size = len(value)
+    byte_size = _get_size_in_bytes(value)
+    text_size = None
+    if isinstance(value, text_type):
+        text_size = len(value)
 
-    if bytes_size and bytes_size > max_length:
+    if byte_size is not None and byte_size > max_length:
         # truncate to max_length bytes, preserving code points
-        truncated_value = _truncate_string_by_bytes(value, max_length)
-    elif text_size and text_size > max_length:
+        truncated_value = _truncate_by_bytes(value, max_length)
+    elif text_size is not None and text_size > max_length:
         # fallback to truncating by string length
         truncated_value = value[: max_length - 3] + "..."
     else:
@@ -1164,7 +1177,7 @@ def strip_string(value, max_length=None):
     return AnnotatedValue(
         value=truncated_value,
         metadata={
-            "len": bytes_size or text_size,
+            "len": byte_size or text_size,
             "rem": [["!limit", "x", max_length - 3, max_length]],
         },
     )
diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py
index f38a14db60..d4067bd5c6 100644
--- a/tests/utils/test_general.py
+++ b/tests/utils/test_general.py
@@ -17,8 +17,6 @@
     set_in_app_in_frames,
     strip_string,
     AnnotatedValue,
-    _get_size_in_bytes,
-    _truncate_string_by_bytes,
 )
 from sentry_sdk._compat import text_type, string_types
 
@@ -588,38 +586,18 @@ def test_failed_base64_conversion(input):
             ),
         ],
         # fmt: off
-        [u"éêéê", None, u"éêéê"],
-        [u"éêéê", 4, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]})],
+        [u"éééé", None, u"éééé"],
+        [u"éééé", 5, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]})],
         # fmt: on
-        ["éêéê", None, "éêéê"],
+        ["éééé", None, "éééé"],
         [
-            "éêéê",
-            4,
+            "éééé",
+            5,
             AnnotatedValue(
-                value="é...", metadata={"len": 8, "rem": [["!limit", "x", 1, 4]]}
+                value="é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]}
             ),
         ],
     ],
 )
 def test_strip_string(input, max_length, result):
     assert strip_string(input, max_length) == result
-
-
-@pytest.mark.parametrize(
-    "input,max_bytes,result",
-    [
-        [None, None],
-    ],
-)
-def test_truncate_by_bytes(input, max_bytes, result):
-    assert _truncate_string_by_bytes(input, max_bytes) == result
-
-
-@pytest.mark.parametrize(
-    "input,result",
-    [
-        ["abc", 3],
-    ],
-)
-def test_get_size_in_bytes(input, max_bytes, result):
-    assert _get_size_in_bytes(input, max_bytes) == result

From e999e9a0061ba615531d4ca7e50ba5940a9199fb Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova <ivana.kellyerova@sentry.io>
Date: Thu, 18 Jan 2024 18:20:50 +0100
Subject: [PATCH 5/6] Handle bytes in _truncate_by_bytes; remove AnnotatedValue.__repr__

---
 sentry_sdk/utils.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
index a77f72c3a4..b860e524f0 100644
--- a/sentry_sdk/utils.py
+++ b/sentry_sdk/utils.py
@@ -388,9 +388,6 @@ def __eq__(self, other):
 
         return self.value == other.value and self.metadata == other.metadata
 
-    def __repr__(self):
-        return self.value
-
     @classmethod
     def removed_because_raw_data(cls):
         # type: () -> AnnotatedValue
@@ -1128,18 +1125,26 @@ def _is_in_project_root(abs_path, project_root):
 
 
 def _truncate_by_bytes(string, max_bytes):
-    # type: (Union[str, bytes], int) -> str
+    # type: (str, int) -> str
     """
     Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.
     """
-    truncated = string.encode("utf-8")[: max_bytes - 3].decode("utf-8", errors="ignore")
+    # This function technically supports bytes, but only for Python 2 compat.
+    # XXX remove support for bytes when we drop Python 2
+    if isinstance(string, bytes):
+        truncated = string[: max_bytes - 3]
+    else:
+        truncated = string.encode("utf-8")[: max_bytes - 3].decode(
+            "utf-8", errors="ignore"
+        )
+
     return truncated + "..."
 
 
 def _get_size_in_bytes(value):
-    # type: (Union[str, bytes]) -> Optional[int]
-    # XXX: broken 'unicodehere' py2 -- can't be encoded
-    # XXX remove repr
+    # type: (str) -> Optional[int]
+    # This function technically supports bytes, but only for Python 2 compat.
+    # XXX remove support for bytes when we drop Python 2
     if not isinstance(value, (bytes, text_type)):
         return None
 

From 17fb6d4c35c062712fdc96423665a2f543a5a403 Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova <ivana.kellyerova@sentry.io>
Date: Thu, 18 Jan 2024 18:27:40 +0100
Subject: [PATCH 6/6] add missing type annotation

---
 sentry_sdk/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
index b860e524f0..f4d3c535ad 100644
--- a/sentry_sdk/utils.py
+++ b/sentry_sdk/utils.py
@@ -383,6 +383,7 @@ def __init__(self, value, metadata):
         self.metadata = metadata
 
     def __eq__(self, other):
+        # type: (Any) -> bool
         if not isinstance(other, AnnotatedValue):
             return False