config: add spam option for extract_text

author Stoiko Ivanov <s.ivanov@proxmox.com>

Mon, 13 Mar 2023 21:23:48 +0000 (22:23 +0100)

committer Thomas Lamprecht <t.lamprecht@proxmox.com>

Wed, 15 Mar 2023 15:51:28 +0000 (16:51 +0100)
author Stoiko Ivanov <s.ivanov@proxmox.com>
Mon, 13 Mar 2023 21:23:48 +0000 (22:23 +0100)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 15 Mar 2023 15:51:28 +0000 (16:51 +0100)
diff --git a/debian/control b/debian/control

index 93ad72cab812bbf39ca501e61e48a35921f2627c..d2ed7da0a67a3f5d76e7379362c267f4c0306509 100644 (file)
--- a/debian/control
+++ b/debian/control
@@ -98,7 +98,14 @@ Depends: apt (>= 2~),
           ucf,
           ${misc:Depends},
           ${perl:Depends},
-Recommends: ifupdown2, proxmox-offline-mirror-helper
+Recommends: antiword,
+            docx2txt,
+            ifupdown2,
+            odt2txt,
+            poppler-utils,
+            proxmox-offline-mirror-helper,
+            tesseract-ocr,
+            unrtf
  Suggests: zfsutils-linux
  Description: Proxmox Mailgateway API Server Implementation
   This implements a REST API to configure Proxmox Mailgateway.
diff --git a/src/PMG/Config.pm b/src/PMG/Config.pm

index 5dcffb76d6fc9aaa6f382f88f101ced7f39e8d2f..699a622b9fecf405fbeb25e5a4e0c21bc2159264 100755 (executable)
--- a/src/PMG/Config.pm
+++ b/src/PMG/Config.pm
@@ -211,6 +211,11 @@ sub properties {
             minimum => 64,
             default => 256*1024,
         },
+       extract_text => {
+           description => "Extract text from attachments (doc, pdf, rtf, images) and scan for spam.",
+           type => 'boolean',
+           default => 0,
+       },
      };
  }
  
@@ -225,6 +230,7 @@ sub options {
         bounce_score => { optional => 1 },
         rbl_checks => { optional => 1 },
         maxspamsize => { optional => 1 },
+       extract_text => { optional => 1 },
      };
  }
  
diff --git a/src/templates/v400.pre.in b/src/templates/v400.pre.in

index 052e73e027664f018c07e4d63a36be875896e37c..4d68d6c8e7508948e320b3ccd9e95dbd03de00de 100644 (file)
--- a/src/templates/v400.pre.in
+++ b/src/templates/v400.pre.in
@@ -16,11 +16,37 @@
  # added to new files, named according to the release they're added in.
  ###########################################################################
  
+
+[% IF pmg.spam.extract_text %]
  # ExtractText - Extract text from documents or images for matching
-#
-# Requires manual configuration, see plugin documentation.
-#
-# loadplugin Mail::SpamAssassin::Plugin::ExtractText
+# Informational headers and hits are not configured.
+loadplugin Mail::SpamAssassin::Plugin::ExtractText
+
+ifplugin Mail::SpamAssassin::Plugin::ExtractText
+
+  extracttext_external  pdftotext  /usr/bin/pdftotext -nopgbrk -layout -enc UTF-8 {} -
+  extracttext_use       pdftotext  .pdf application/pdf
+
+  # http://docx2txt.sourceforge.net
+  extracttext_external  docx2txt   /usr/bin/docx2txt {} -
+  extracttext_use       docx2txt   .docx application/docx
+
+  extracttext_external  antiword   /usr/bin/antiword -t -w 0 -m UTF-8.txt {}
+  extracttext_use       antiword   .doc application/(?:vnd\.?)?ms-?word.*
+
+  extracttext_external  unrtf      /usr/bin/unrtf --nopict {}
+  extracttext_use       unrtf      .doc .rtf application/rtf text/rtf
+
+  extracttext_external  odt2txt    /usr/bin/odt2txt --encoding=UTF-8 {}
+  extracttext_use       odt2txt    .odt .ott application/.*?opendocument.*text
+  extracttext_use       odt2txt    .sdw .stw application/(?:x-)?soffice application/(?:x-)?starwriter
+
+  extracttext_external  tesseract  {OMP_THREAD_LIMIT=1} /usr/bin/tesseract -c page_separator= {} -
+  extracttext_use       tesseract  .jpg .png .bmp .tif .tiff image/(?:jpeg|png|x-ms-bmp|tiff)
+
+endif
+
+[% END %]
  
  # DecodeShortUrl - Check for shortened URLs
  #
author	Stoiko Ivanov <s.ivanov@proxmox.com>
	Mon, 13 Mar 2023 21:23:48 +0000 (22:23 +0100)
committer	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Wed, 15 Mar 2023 15:51:28 +0000 (16:51 +0100)
debian/control		patch | blob | blame | history
src/PMG/Config.pm		patch | blob | blame | history
src/templates/v400.pre.in		patch | blob | blame | history