From b42feb13ebdde8760af46d19229dfbea21aae330 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 17 May 2023 11:52:28 +0200 Subject: [PATCH 1/3] README-Unicode.md, README.md: Describe migration to Python3 open() Signed-off-by: Bernhard Kaindl --- README-Unicode.md | 192 ++++++++++++++++++++++++++++++++++++++++++++++ README.md | 12 +-- 2 files changed, 193 insertions(+), 11 deletions(-) create mode 100644 README-Unicode.md diff --git a/README-Unicode.md b/README-Unicode.md new file mode 100644 index 00000000..82e0999f --- /dev/null +++ b/README-Unicode.md @@ -0,0 +1,192 @@ +Python3 Unicode migration in the XCP package +============================================ + +Migrating `subprocess.Popen()` +------------------------------ +With Python3, the `stdin`, `stdout` and `stderr` pipes for `Popen()` default to `bytes`(binary mode). Binary mode is much safer because it foregoes the encode/decode. + +The for working with strings, existing users need to either enable text mode (when safe, it will attempt to decode and encode!) or be able to use bytes instead. + +For cases where the data is guaranteed to be pure ASCII, such as when resting the `proc.stdout` of `lspci -nm`, it is sufficient to use: +```py +open(["lspci, "-nm"], stdout=subprocess.PIPE, universal_newlines=True) +``` +This is possible because `universal_newlines=True` is accepted by Python2 and Python3. +On Python3, it also enables text mode for `subprocess.PIPE` streams (newline conversion +not needed, but text mode is needed) + +Migrating `builtins.open()` +--------------------------- +On Python3, `builtins.open()` can be used in a number of modes: +- Binary mode (when `"b"` is in `mode`): `read()` and `write()` use `bytes`. +- Text mode (Python3 default up to Python 3.6), when UTF-8 character encoding is not set by the locale +- UTF-8 mode (default since Python 3.7): https://peps.python.org/pep-0540/ + +When no Unicode locale in force, like in XAPI plugins, Python3 will be in text mode or UTF-8 (since Python 3.7, but existing XS is on 3.6): + +* By default, `read()` on files `open()`ed without selecting binary mode attempts + to decode the data into the Python3 Unicode string type. + This fails for binary data. + Any `ord() >= 128`, when no UTF-8 locale is active With Python 3.6, triggers `UnicodeDecodeError`. + +* Thus, all `open()` calls which might open binary files have to be converted to binary + or UTF-8 mode unless the caller is sure he is opening an ASCII file. + But even then, enabling an error handler to handle decoding errors is recommended: + ```py + open(filename, errors="replace") + ``` + But neither `errors=` nor `encoding=` is accepted by Python2, so a wrapper is likely best. + +### Binary mode + +When decoding bytes to strings is not needed, binary mode can be great because it side-steps string codecs. But, string operations involving string Literals have to be changed to bytes. + +However, when strings need to returned from the library, something like `bytes.decode(errors="ignore")` to get strings is needed. + +### Text mode + +Text mode using the `ascii` codec should be only used when it is ensured that the data can only consist of ASCII characters (0-127). Sadly, it is the default in Python 3.6 when the Python interpreter was not started using an UTF-8 locale for the LC_CTYPE locale category (set by LC_ALL, LC_CTYPE, LANG environment variables, overriding each other in that order) + +### UTF-8 mode + +Most if the time, the `UTF-8` codec should be used since even simple text files which are even documented to contain only ASCII characters like `"/usr/share/hwdata/pci.ids"` in fact __do__ contain UTF-8 characters. + +Some files or some output data from commands even contains legacy `ISO-8859-1` chars, and even the `UTF-8` codec would raise `UnicodeDecodeError` for these. +When this is known to be the case, `encoding="iso-8859-1` could be tried (not tested yet). + +### Problems + +With the locale set to C (XAPI plugins have that), Python's default mode changes +between 3.6 and 3.7: +```py +for i in 3.{6,7,10,11};do echo -n "3.$i: "; + LC_ALL=C python3.$i -c 'import locale,sys;print(locale.getpreferredencoding())';done +3.6: ANSI_X3.4-1968 +3.7: UTF-8 +3.10: UTF-8 +3.11: utf-8 +``` +This has the effect that in Python 3.6, the default codec for XAPI plugins is `ascii`: +```py +for i in 2.7 3.{6,7};do echo "$i:"; + LC_ALL=C python$i -c 'open("/usr/share/hwdata/pci.ids").read()';done +2.7: +3.6: +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib64/python3.6/encodings/ascii.py", line 26, in decode + return codecs.ascii_decode(input, self.errors)[0] +UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 97850: ordinal not in range(128) +3.7: +``` +This error means that the `'ascii' codec` cannot handle input ord() >= 128, and as some Video cards use `²` to reference their power, the `ascii` codec chokes on them. + +It means `xcp.pci.PCIIds()` cannot use `open("/usr/share/hwdata/pci.ids").read()`. + +While Python 3.7 and newer use UTF-8 mode by default, it does not set up an error handler for `UnicodeDecodeError`. + +As it happens, some older tools output ISO-8859-1 characters hard-coded and these aren't valid UTF-8 sequences, and even newer Python versions need error handlers to not fail: +```py +echo -e "\0262" # ISO-8859-1 for: "²" +python3 -c 'open(".text").read()' +Traceback (most recent call last): + File "", line 1, in + File "", line 322, in decode +UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 0: invalid start byte +``` + +```py +pylint -d all -e unspecified-encoding --msg-template="{path} line {line} in {obj}()" xcp/ tests/ +************* Module xcp.accessor +xcp/accessor.py line 165 in MountingAccessor.writeFile() +xcp/accessor.py line 240 in FileAccessor.writeFile() +************* Module xcp.bootloader +xcp/bootloader.py line 111 in Bootloader.readExtLinux() +xcp/bootloader.py line 219 in Bootloader.readGrub() +xcp/bootloader.py line 335 in Bootloader.readGrub2() +xcp/bootloader.py line 465 in Bootloader.writeExtLinux() +xcp/bootloader.py line 507 in Bootloader.writeGrub() +xcp/bootloader.py line 541 in Bootloader.writeGrub2() +************* Module xcp.cmd +xcp/cmd.py line 67 in OutputCache.fileContents() +************* Module xcp.dom0 +xcp/dom0.py line 85 in default_memory() +************* Module xcp.environ +xcp/environ.py line 48 in readInventory() +************* Module xcp.logger +xcp/logger.py line 51 in openLog() +************* Module xcp.net.ifrename.dynamic +xcp/net/ifrename/dynamic.py line 95 in DynamicRules.load_and_parse() +xcp/net/ifrename/dynamic.py line 292 in DynamicRules.save() +************* Module xcp.net.ifrename.static +xcp/net/ifrename/static.py line 118 in StaticRules.load_and_parse() +xcp/net/ifrename/static.py line 330 in StaticRules.save() +************* Module tests.test_biosdevname +tests/test_biosdevname.py line 30 in TestDeviceNames.test() +tests/test_biosdevname.py line 32 in TestDeviceNames.test() +************* Module tests.test_bootloader +tests/test_bootloader.py line 32 in TestLinuxBootloader.setUp() +tests/test_bootloader.py line 34 in TestLinuxBootloader.setUp() +tests/test_bootloader.py line 36 in TestLinuxBootloader.setUp() +tests/test_bootloader.py line 38 in TestLinuxBootloader.setUp() +************* Module tests.test_pci +tests/test_pci.py line 96 in TestPCIIds.test_videoclass_by_mock_calls() +tests/test_pci.py line 110 in TestPCIIds.mock_lspci_using_open_testfile() +``` +Of course, `xcp/net/ifrename` won't be affected but it would be good to fix the +warning for them as well in an intelligent way. See the proposal for that below. + +There are a couple of possibilities and Python because 2.7 does not support the +arguments we need to pass to ensure that all users of open() will work, we need +to make passing the arguments conditional on Python >= 3. + +1. Overriding `open()`, while technically working would not only affect xcp.python but the entire program: + ```py + if sys.version_info >= (3, 0): + original_open = __builtins__["open"] + def uopen(*args, **kwargs): + if "b" not in (args[1] \ + if len(args) >= 2 else kwargs.get("mode", "")): + kwargs.setdefault("encoding", "UTF-8") + kwargs.setdefault("errors", "replace") + return original_open(*args, **kwargs) + __builtins__["open"] = uopen + ``` +2. This is sufficient but is not very nice: + ```py + # xcp/utf8mode.py + if sys.version_info >= (3, 0): + open_utf8args = {"encoding": "utf-8", "errors": "replace"} + else: + open_utf8args = {} + # xcp/{cmd,pci,environ?,logger?}.py tests/test_{pci,biodevname?,...?}.py + + from .utf8mode import open_utf8args + ... + - open(filename) + + open(filename, **open_utf8args) + ``` + But, `pylint` will still warn about these lines, so I propose: + +3. Instead, use a wrapper function, which will also silence the `pylint` warnings at the locations which have been changed to use it: + ```py + # xcp/utf8mode.py + if sys.version_info >= (3, 0): + def utf8open(*args, **kwargs): + if len(args) > 1 and "b" in args[1]: + return open(*args, **kwargs) + return open(*args, encoding="utf-8", errors="replace", **kwargs) + else: + utf8open = open + # xcp/{cmd,pci,environ?,logger?}.py tests/test_{pci,biodevname?,...?}.py + + from .utf8mode import utf8open + ... + - open(filename) + + utf8open(filename) + ``` + +Using the 3rd option, the `pylint` warnings for the changed locations +`unspecified-encoding` and `consider-using-with` don't appear without +explicitly disabling them. + +PS: Since utf8open() still returns a context-manager, `with open(...) as f:` +would still work. \ No newline at end of file diff --git a/README.md b/README.md index 855d33b7..9637d929 100644 --- a/README.md +++ b/README.md @@ -172,17 +172,7 @@ Special open TODOs: ------------------- Charset encoding/string handling: -* With Python3, `read()` on files `open()`ed without specifying binary mode will attempt - to decode the data into the Python3 Unicode string type, which will fail for all - binary data. Thus all `open()` calls which might open binary files have to be converted - to binary mode by default unless the caller is sure he is opening an ASCII file, - even then, enabling an error handle to handle decoding errors is recommended. -* With Python3, the `stdin`, `stdout` and `stderr` pipes for `Popen()` default to - `bytes`(binary mode.) Binary mode is much safer because it foregoes the encode/decode. The existing users need to be able to enable text mode (when safe, it will attempt - to decode and encode!) or preferably be able to use bytes (which is the type behind Python2 strings too) instead. See these PRs for details: - * https://github.com/xenserver/python-libs/pull/22 - * https://github.com/xenserver/python-libs/pull/23 - * https://github.com/xenserver/python-libs/pull/24 + See [README-Unicode.md](README-Unicode.md) for details: * What's more: When code is called from a xapi plugin (such as ACK), when such code attempts to read text files like the `pciids` file, and there is a Unicode char it int, and the locale is not set up to be UTF-8 (because xapi plugins are started From ecbbb1f4ee166e7b5f6c39c979eea29cc1d074d0 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 17 May 2023 11:54:40 +0200 Subject: [PATCH 2/3] tests/test_pci.py: Add a 3rd GPU to the fake PCI bus for further tests Signed-off-by: Bernhard Kaindl --- tests/data/lspci-mn | 1 + tests/data/pci.ids | 1 + tests/test_pci.py | 28 ++++++++++++++++++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/data/lspci-mn b/tests/data/lspci-mn index 4450e6d3..3cbd49fb 100644 --- a/tests/data/lspci-mn +++ b/tests/data/lspci-mn @@ -24,6 +24,7 @@ 03:00.0 "0380" "1002" "7340" -rc1 "1462" "12ac" 03:00.1 "0403" "1002" "ab38" "1462" "12ac" 04:00.0 "0280" "8086" "2723" -r1a "8086" "0084" +04:01.0 "0380" "1002" "67b0" "1787" "2020" 05:00.0 "0200" "10ec" "8168" -r15 "1462" "12ac" 06:00.0 "0108" "144d" "a808" -p02 "144d" "a801" 07:00.0 "0300" "1002" "1636" -rc6 "1462" "12ac" diff --git a/tests/data/pci.ids b/tests/data/pci.ids index acaccbb1..e3873068 100644 --- a/tests/data/pci.ids +++ b/tests/data/pci.ids @@ -11,6 +11,7 @@ 1314 Wrestler HDMI Audio 174b 1001 PURE Fusion Mini 1636 Renoir + 67b0 Hawaii XT / Grenada XT [Radeon R9 290X/390X] 7340 Navi 14 [Radeon RX 5500/5500M / Pro 5500M] # List of known device classes, subclasses and programming interfaces diff --git a/tests/test_pci.py b/tests/test_pci.py index 9764887c..237f5096 100644 --- a/tests/test_pci.py +++ b/tests/test_pci.py @@ -116,18 +116,34 @@ def assert_videoclass_devices(self, ids, devs): # type: (PCIIds, PCIDevices) -> self.assertEqual(video_class, ["03"]) sorted_devices = sorted(devs.findByClass(video_class), key=lambda x: x['id']) - self.assertEqual(len(sorted_devices), 2) + # Assert devs.findByClass() finding 3 GPUs from tests/data/lspci-mn in our mocked PCIIds DB: + self.assertEqual(len(sorted_devices), 3) + + # For each of the found devices, assert these expected values: for (video_dev, num_functions, vendor, device, ) in zip(sorted_devices, - (1, 5), - ("Advanced Micro Devices, Inc. [AMD/ATI]", - "Advanced Micro Devices, Inc. [AMD/ATI]"), - ("Navi 14 [Radeon RX 5500/5500M / Pro 5500M]", - "Renoir"), + # 1: Number of other PCI device functions shown by mocked lspci in this PCI slot: + ( + 1, + 0, + 5, + ), + # 2: GPU Vendor + ( + "Advanced Micro Devices, Inc. [AMD/ATI]", + "Advanced Micro Devices, Inc. [AMD/ATI]", + "Advanced Micro Devices, Inc. [AMD/ATI]", + ), + # 3: GPU Device name + ( + "Navi 14 [Radeon RX 5500/5500M / Pro 5500M]", + "Hawaii XT / Grenada XT [Radeon R9 290X/390X]", + "Renoir", + ), ): self.assertEqual(len(devs.findRelatedFunctions(video_dev['id'])), num_functions) self.assertEqual(ids.findVendor(video_dev['vendor']), vendor) From daab4d16f7e92f11c0e23d192354d9c777a45dae Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Fri, 19 May 2023 11:33:06 +0200 Subject: [PATCH 3/3] .pci.PCIIds.read(): Fix reading pci.ids with UTF-8 chars and test it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ACK XAPI plugin calls xcp.pci.PCIIds.read() without an UTF-8 locale set. Problem: Because there are Unicode characters ('²') in `/usr/share/hwdata/pci.ids`, it needs to be opened with UTF-8 decoding set up for `read()` to succeed. But, because XAPI plugins are not started with the locale set to an UTF-8 locale this fails on Python 3.6 because in this case, it defaults to using the `ascii` codec which does not support `ord()` >= 128. Thus, `xcp.pci.PCIIds.read()` fails to read the PCI IDs database when no locale is set on Python 3.6. Fix: Wrap the open() call using a function which enables the UTF-8 encoding when run on Python 3.0 or newer. The use of the wrapper function also fixes the pylint warning `unspecified-encoding` for not passing `encoding="utf-8"`. Signed-off-by: Bernhard Kaindl --- tests/data/pci.ids | 1 + tests/test_pci.py | 21 +++++++++++++++++++-- tox.ini | 1 + xcp/compat.py | 21 +++++++++++++++++++++ xcp/pci.py | 4 +++- 5 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 xcp/compat.py diff --git a/tests/data/pci.ids b/tests/data/pci.ids index e3873068..a8a38dfc 100644 --- a/tests/data/pci.ids +++ b/tests/data/pci.ids @@ -12,6 +12,7 @@ 174b 1001 PURE Fusion Mini 1636 Renoir 67b0 Hawaii XT / Grenada XT [Radeon R9 290X/390X] + 1787 2020 R9 290X IceQ X² Ultra Turbo Overdrive³ USB 3.2 SuperSpeed Edition 7340 Navi 14 [Radeon RX 5500/5500M / Pro 5500M] # List of known device classes, subclasses and programming interfaces diff --git a/tests/test_pci.py b/tests/test_pci.py index 237f5096..e136bacc 100644 --- a/tests/test_pci.py +++ b/tests/test_pci.py @@ -1,3 +1,7 @@ +# -*- coding: utf-8 -*- +# pylint: disable=unspecified-encoding # we pass encoding using kwargs where needed + + import subprocess import unittest from os import environ @@ -5,6 +9,7 @@ import pyfakefs.fake_filesystem_unittest # type: ignore[import] from mock import patch, Mock +from xcp.compat import open_utf8 from xcp.pci import PCI, PCIIds, PCIDevices class TestInvalid(unittest.TestCase): @@ -87,8 +92,8 @@ def test_videoclass_without_mock(self): def test_videoclass_by_mock_calls(self): with patch("xcp.pci.os.path.exists") as exists_mock, \ - patch("xcp.pci.open") as open_mock, \ - open("tests/data/pci.ids") as fake_data: + patch("xcp.pci.utf8open") as open_mock, \ + open("tests/data/pci.ids", **open_utf8) as fake_data: # type: ignore[call-overload] exists_mock.return_value = True open_mock.return_value.__iter__ = Mock(return_value=iter(fake_data)) ids = PCIIds.read() @@ -125,6 +130,7 @@ def assert_videoclass_devices(self, ids, devs): # type: (PCIIds, PCIDevices) -> num_functions, vendor, device, + subdevice, ) in zip(sorted_devices, # 1: Number of other PCI device functions shown by mocked lspci in this PCI slot: ( @@ -144,9 +150,20 @@ def assert_videoclass_devices(self, ids, devs): # type: (PCIIds, PCIDevices) -> "Hawaii XT / Grenada XT [Radeon R9 290X/390X]", "Renoir", ), + # 4: GPU Subdevice name + ( + None, + "R9 290X IceQ X² Ultra Turbo Overdrive³ USB 3.2 SuperSpeed Edition", + None, + ), ): self.assertEqual(len(devs.findRelatedFunctions(video_dev['id'])), num_functions) self.assertEqual(ids.findVendor(video_dev['vendor']), vendor) self.assertEqual(ids.findDevice(video_dev['vendor'], video_dev['device']), device) + # Expect that we can lookup the subdevice and get the name of the subdevice, if found: + self.assertEqual( + ids.findSubdevice(video_dev["subvendor"], video_dev["subdevice"]), + subdevice, + ) self.assertEqual(len(devs.findRelatedFunctions('00:18.1')), 7) diff --git a/tox.ini b/tox.ini index 413e578c..8da8abf2 100644 --- a/tox.ini +++ b/tox.ini @@ -73,6 +73,7 @@ passenv = fox: XAUTHORITY fox: DBUS_SESSION_BUS_ADDRESS setenv = + LC_ALL=C # Ensure that xcp is tested without an locale (like XAPI plugins) PYTHONDEVMODE=yes # Enables development/resource checks: eg unclosed files and more PYTHONPATH=stubs # Inhibit dev-warnings on pytest plugins, but we enable dev-warnings in conftest.py diff --git a/xcp/compat.py b/xcp/compat.py new file mode 100644 index 00000000..48690ff8 --- /dev/null +++ b/xcp/compat.py @@ -0,0 +1,21 @@ +"""Helper module for providing common defaults on how to enable UTF-8 I/O in Py3.6 and newer""" +# See README-Unicode.md for Details +import sys + +if sys.version_info >= (3, 0): # pragma: no cover + open_utf8 = {"encoding": "utf-8", "errors": "replace"} + + def utf8open(filename, *args, **kwargs): + """Helper for open(): Handle UTF-8: Default to encoding="utf-8", errors="replace" for Py3""" + if "b" in (args[0] if args else kwargs.get("mode", "")): + # Binary mode: just call open() unmodified: + return open(filename, *args, **kwargs) # pylint: disable=unspecified-encoding + # Text mode: default to UTF-8 with error handling to replace malformed UTF-8 sequences + kwargs.setdefault("encoding", "utf-8") # Needed for Python 3.6 when no UTF-8 locale is set + kwargs.setdefault("errors", "replace") # Simple codec error handler: Replace malformed char + # pylint: disable-next=unspecified-encoding + return open(filename, *args, **kwargs) # type: ignore[call-overload] +else: + # Python2.7: None of the above is either supported or relevant (strings are bytes): + open_utf8 = {} + utf8open = open diff --git a/xcp/pci.py b/xcp/pci.py index 9bb64aa9..4610632b 100644 --- a/xcp/pci.py +++ b/xcp/pci.py @@ -26,6 +26,8 @@ import re import six +from .compat import utf8open + _SBDF = (r"(?:(?P [\da-dA-F]{4}):)?" # Segment (optional) r" (?P [\da-fA-F]{2}):" # Bus r" (?P [\da-fA-F]{2})\." # Device @@ -185,7 +187,7 @@ def __init__(self, fn): vendor = None cls = None - fh = open(fn) + fh = utf8open(fn) for l in fh: line = l.rstrip() if line == '' or line.startswith('#'):