git.proxmox.com Git - mirror_linux-firmware.git/commitdiff
Try both utf-8 and windows-1252 for decoding email
author Mario Limonciello <mario.limonciello@amd.com>
Thu, 16 Nov 2023 16:42:10 +0000 (10:42 -0600)
committer Mario Limonciello <mario.limonciello@amd.com>
Thu, 16 Nov 2023 16:42:10 +0000 (10:42 -0600)
Recent submissions from Cirrus were classified as spam by the lore
analysis robot script.  This is because Cirrus used windows-1252 for
the encoding, which failed to decode as utf-8.

Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
contrib/process_linux_firmware.py

index 668e35c0eb0600fcd4cde19c560a4b32a8dd8542..ea108391d44d125b300f0f3df56980e8b7ef032d 100755 (executable)
@@ -34,6 +34,8 @@ content_types = {
 def classify_content(content):
     # load content into the email library
     msg = email.message_from_string(content)
+    decoded = None
+    body = None
 
     # check the subject
     subject = msg["Subject"]
@@ -42,17 +44,28 @@ def classify_content(content):
     if "PATCH" in subject:
         return ContentType.PATCH
 
-    for part in msg.walk():
-        if part.get_content_type() == "text/plain":
+    if msg.is_multipart():
+        for part in msg.walk():
+            if part.get_content_type() == "text/plain":
+                body = part.get_payload(decode=True)
+    else:
+        body = msg.get_payload(decode=True)
+
+    if body:
+        for encoding in ["utf-8", "windows-1252"]:
             try:
-                body = part.get_payload(decode=True).decode("utf-8")
-                for key in content_types.keys():
-                    if key in body:
-                        return content_types[key]
-                break
-            except UnicodeDecodeError as e:
-                logging.warning("Failed to decode email: %s, treating as SPAM" % e)
+                decoded = body.decode(encoding)
                 break
+            except UnicodeDecodeError:
+                pass
+
+    if decoded:
+        for key in content_types.keys():
+            if key in decoded:
+                return content_types[key]
+    else:
+        logging.warning("Failed to decode email: %s, treating as SPAM", body)
+
     return ContentType.SPAM