]> git.proxmox.com Git - proxmox-spamassassin.git/blame - sa-updates/20_pdfinfo.cf
buildsys: use simple parenthesis for variables
[proxmox-spamassassin.git] / sa-updates / 20_pdfinfo.cf
CommitLineData
b780ea8d
SI
1# SpamAssassin rules file: Pdfinfo rules
2#
3# Please don't modify this file as your changes will be overwritten with
4# the next update. Use /etc/mail/spamassassin/local.cf instead.
5# See 'perldoc Mail::SpamAssassin::Conf' for details.
6#
7# <@LICENSE>
8# Licensed to the Apache Software Foundation (ASF) under one or more
9# contributor license agreements. See the NOTICE file distributed with
10# this work for additional information regarding copyright ownership.
11# The ASF licenses this file to you under the Apache License, Version 2.0
12# (the "License"); you may not use this file except in compliance with
13# the License. You may obtain a copy of the License at:
14#
15# http://www.apache.org/licenses/LICENSE-2.0
16#
17# Unless required by applicable law or agreed to in writing, software
18# distributed under the License is distributed on an "AS IS" BASIS,
19# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20# See the License for the specific language governing permissions and
21# limitations under the License.
22# </@LICENSE>
23#
24###########################################################################
25
26# 2014-12-02 - axb
27# Info and disabled rules kept for historical & documentation reasons
28# Updated rules may be added
29#
30# Original File: pdfinfo.cf
31# Original Version: 0.6
32# Info: $Id: pdfinfo.cf 895 2007-07-27 10:31:08Z alexb $
33# Created: 2007-06-25
34# Modified: 2007-07-19
35# Original / Defunct Site URL: http://www.rulesemporium.com/plugins.htm#PDFinfo
36# Author: Dallas Engelken (aka GMD :-)
37# Rules contributed by Alex Broens
38# Requires: PDFInfo.pm plugin
39# Description: This plugin/ruleset combination will help you alleviate the new
40# PDF based stock spam which began to appear mid-June, 2007.
41#
42#
43# Changes:
44#
45# 0.6 - added easypdf producer rule and more no body text metas
46# - tags support added, see USING TAGS below.
47# 0.5 - added fuzzy test 7
48# 0.4 - added new fuzzy for encrypted pdf image spams.
49# - added rule to check for encryption
50# 0.3 - added rules based on the new pdf_match_details() function
51# - added additional fuzzy md5 rules
52# - disabled static md5 rules as they are no longer hitting.
53# 0.2 - added static md5 to hit full page stock spam.
54# 0.1 - initial ruleset.
55#
56
57############################################
58# USING TAGS
59############################################
60
61# The following tags can be defined in an add_header line
62#
63# _PDFCOUNT_ - total number of pdf mime parts in the email
64# _PDFIMGCOUNT_ - total number of images found inside pdf mime parts
65# _PDFVERSION_ - PDF Version, space separated if there are > 1 pdf attachments
66# _PDFNAME_ - Filenames as found in the mime headers of PDF parts
67# _PDFPRODUCER_ - Producer/Application that created the PDF(s)
68# _PDFAUTHOR_ - Author of the PDF
69# _PDFCREATOR_ - Creator/Program that created the PDF(s)
70# _PDFTITLE_ - Title of the PDF File, if available
71# _PDFIMGDIM_ - If PDF Contains images, the dimensions of them will be put here
72# _PDFIMGAREA_ - The total area of all combined images inside the PDF(s)
73# _PDFMD5_ - MD5 checksum of PDF(s) - space separated
74# _PDFMD5FUZZY1_- Fuzzy1 MD5 checksum of PDF(s) - space separated
75# _PDFMD5FUZZY2_- Fuzzy2 MD5 checksum of PDF(s) - space separated
76#
77# Example add_header lines
78#
79# add_header all PDF-Info pdf=_PDFCOUNT_, pdfimg=_PDFIMGCOUNT_, ver=_PDFVERSION_, name=_PDFNAME_
80# add_header all PDF-Details producer=_PDFPRODUCER_, author=_PDFAUTHOR_, creator=_PDFCREATOR_, title=_PDFTITLE_
81# add_header all PDF-ImageInfo dim=_PDFIMGDIM_, area=_PDFIMGAREA_
82# add_header all PDF-Md5 md5=_PDFMD5_, fuzzy1=_PDFMD5FUZZY1_, fuzzy2=_PDFMD5FUZZY2_
83#
84
85############################################
86# GENERIC RULE EXAMPLES SHOWING EVAL USAGE
87############################################
88
89# you can match by name
90# body MY_TEST_PDF eval:pdf_named('mytest.pdf')
91
92# or you can write a regex to match dynamic file names.
93# body MY_TEST_PDF eval:pdf_name_regex('/^(?:my|your)test\.pdf$/')
94
95# you can make it case insensitive by using modifiers
96# body PDF_IMGXXXXX eval:pdf_name_regex('/^IMG\D+\.\.PDF$/i')
97
98# you can do exact image size matches
99# body PDF_DEMS_150_400 eval:pdf_image_size_exact(150,400)
100
101# you can do image to text, or image to html ratios
102# rawbody PDF_TO_HTML_RATIO eval:pdf_image_to_text_ratio(0.000, 0.015)
103# body PDF_TO_TEXT_RATIO eval:pdf_image_to_text_ratio(0.000, 0.008)
104
105# you can do minimum dimension matches
106# body PDF_SIZE_RANGE_1 eval:pdf_image_size_range(300,300)
107
108# you can do ranged dimension matches
109# body PDF_SIZE_RANGE_2 eval:pdf_image_size_range(200, 300, 250, 350)
110
111# you can count the number of pdf mime parts
112# body PDF_MIME_COUNT_1 eval:pdf_count(1,1)
113# body PDF_MIME_COUNT_2_PLUS eval:pdf_count(2)
114
115# you can count the number of images inside the pdfs
116# body PDF_IMG_COUNT_1 eval:pdf_image_count(1,1)
117# body PDF_IMG_COUNT_2_PLUS eval:pdf_image_count(2)
118
119# you can determine pixel coverage
120# body PDF_AREA_SMALL eval:pdf_pixel_coverage(1,100000)
121
122
123# match a md5 or fuzzy md5 signature of the pdf
124
125# body PDF_BAD_MD5 eval:pdf_match_md5('C359F8F89B290DA99DC997ED50117CDF')
126# body PDF_BAD_FUZZY eval:pdf_match_fuzzy_md5('7340821445D975EEF6F5BDE2EC257900')
127
128# Now you can match against certain details if they are found in the PDF.
129# A regex match is used on the value specified, so if you want to do an
130# exact match, use anchors ^value$
131#
132# body GMD_AUTHOR_MOBILE eval:pdf_match_details('author','/^mobile$/')
133# body GMD_PRODUCER_GPL eval:pdf_match_details('producer','/(?i)^gpl ghostscript/')
134# body GMD_CREATOR_PSCRIPT5 eval:pdf_match_details('creator','/^PScript5/')
135# body GMD_TITLE_WORD_DOC1 eval:pdf_match_details('title','/^Microsoft Word \- Document1$/')
136# body GMD_CREATED_JULY07 eval:pdf_match_details('created','/^200707/')
137# body GMD_MODIFIED_JULY07 eval:pdf_match_details('modified','/^200707/')
138
139ifplugin Mail::SpamAssassin::Plugin::PDFInfo
140
141#######################################
142# DISABLED RULES, ENABLE IF YOU WANT
143#######################################
144
145# Small area
146# Disabled - Hits Ham
147# body GMD_PDF_SMALL_AREA eval:pdf_pixel_coverage(1,100000)
148# describe GMD_PDF_SMALL_AREA PDF Area covers 100k pixels or less
149# score GMD_PDF_SMALL_AREA 0.75
150# counts GMD_PDF_SMALL_AREA 51s/15h of 10615 corpus (5652s/4963h AxB) 06/25/07
151
152# NOTE - people do send pdf's without message bodies!
153# Disabled - Hits Ham
154# body GMD_PDF_NO_TXT eval:pdf_image_to_text_ratio(0.000, 0.005)
155# describe GMD_PDF_NO_TXT Low rawbody to pixel area ratio
156# score GMD_PDF_NO_TXT 0.01
157# counts GMD_PDF_NO_TXT 64s/3h of 10615 corpus (5652s/4963h AxB) 06/25/07
158
159####################################
160# HERE ARE THE LIVE RULES
161####################################
162
163
164
165######################################################################################################
166# pdf image dimensions
167
168# thin horizontal, common stox.
169body GMD_PDF_HORIZ eval:pdf_image_size_range(100, 450, 240, 800)
170describe GMD_PDF_HORIZ Contains pdf 100-240 (high) x 450-800 (wide)
171score GMD_PDF_HORIZ 0.25
172# counts GMD_PDF_HORIZ 135s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
173# counts GMD_PDF_HORIZ 278s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
174
175# near square, and small. common stox.
176body GMD_PDF_SQUARE eval:pdf_image_size_range(180, 180, 360, 360)
177describe GMD_PDF_SQUARE Contains pdf 180-360 (high) x 180-360 (wide)
178score GMD_PDF_SQUARE 0.50
179# counts GMD_PDF_SQUARE 36s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
180# counts GMD_PDF_SQUARE 46s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
181
182# thin vertical, very tall. common stox.
183body GMD_PDF_VERT eval:pdf_image_size_range(450, 100, 800, 240)
184describe GMD_PDF_VERT Contains pdf 450-800 (high) x 100-240 (wide)
185score GMD_PDF_VERT 0.90
186# counts GMD_PDF_VERT 24s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
187# counts GMD_PDF_VERT 10s/0h of 11773 corpus (10988s/785h AxB2-TRAPS) 07/11/07
188
189######################################################################################################
190# static checksums
191
192# all static md5 spam runs are complete as of 7/11
193# if there are more, we'll add new rules.
194
195# removed fuzzy rules dated 2007
196# Get fuzzy info:
197# cat msg.eml | spamassassin --debug pdfinfo 2>&1 | grep fuzzy 2>&1
198
199# sample rules ONLY
200# fuzzy checksum for bad stox
201#body GMD_PDF_FUZZY1_T1 eval:pdf_match_fuzzy_md5('57EBC1FFB1A24CC14AE23E1E227C3484')
202#describe GMD_PDF_FUZZY1_T1 Fuzzy MD5 Match 57EBC1FFB1A24CC14AE23E1E227C3484
203#score GMD_PDF_FUZZY1_T1 0.001
204
205# same as rule above using fuzzy md5 of pdf structure
206#body GMD_PDF_FUZZY2_T1 eval:pdf_match_fuzzy_md5('653C8AA9FDFD03D382523488058360A2')
207#describe GMD_PDF_FUZZY2_T1 Fuzzy MD5 Match 653C8AA9FDFD03D382523488058360A2
208#score GMD_PDF_FUZZY2_T1 0.001
209
210
211######################################################################################################
212# pdf_match_details()
213
214# from embedded link spam
215#body GMD_AUTHOR_COLET eval:pdf_match_details('author','/^colet$/')
216#describe GMD_AUTHOR_COLET PDF author was 'colet'
217#score GMD_AUTHOR_COLET 4.50
218# counts GMD_AUTHOR_COLET 1s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
219# counts GMD_AUTHOR_COLET 2s/0h of 6132 corpus (555s/1577h AxB-MANUAL) 07/11/07
220
221# from full page pdf stock spammer.
222#body GMD_AUTHOR_MOBILE eval:pdf_match_details('author','/^mobile$/')
223#describe GMD_AUTHOR_MOBILE PDF author was 'mobile'
224#score GMD_AUTHOR_MOBILE 2.75
225# counts GMD_AUTHOR_MOBILE 2s/0h of 6132 corpus (555s/1577h AxB-MANUAL) 07/11/07
226# counts GMD_AUTHOR_MOBILE 55s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
227
228# txt only stock spam
229#body GMD_AUTHOR_OOO eval:pdf_match_details('author','/^openofficeuser$/')
230#describe GMD_AUTHOR_OOO PDF author was 'openofficeuser'
231#score GMD_AUTHOR_OOO 1.75
232# counts GMD_AUTHOR_OOO 1s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
233# counts GMD_AUTHOR_OOO 118s/0h of 6132 corpus (555s/1577h AxB-MANUAL) 07/11/07
234
235# txt only stock spam
236#body GMD_AUTHOR_HPADMIN eval:pdf_match_details('author','/^HP_Administrator/')
237#describe GMD_AUTHOR_HPADMIN PDF author was 'HP_Administrator'
238#score GMD_AUTHOR_HPADMIN 0.25
239# counts GMD_AUTHOR_HPADMIN 105s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
240# counts GMD_AUTHOR_HPADMIN 27s/0h of 11773 corpus (10988s/785h AxB2-TRAPS) 07/11/07
241
242# generic rule for software used to produce the pdf.
243body GMD_PRODUCER_GPL eval:pdf_match_details('producer','/^(?:gnu|gpl) ghostscript/i')
244describe GMD_PRODUCER_GPL PDF producer was GPL Ghostscript
245score GMD_PRODUCER_GPL 0.25
246# counts GMD_PRODUCER_GPL 227s/0h of 6132 corpus (555s/1577h AxB-MANUAL) 07/11/07
247# counts GMD_PRODUCER_GPL 85s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
248
249# generic rule for software used to produce the pdf.
250body GMD_PRODUCER_POWERPDF eval:pdf_match_details('producer','/^PowerPdf 0\./')
251describe GMD_PRODUCER_POWERPDF PDF producer was PowerPDF
252score GMD_PRODUCER_POWERPDF 0.25
253# counts GMD_PRODUCER_POWERPDF 0s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
254# counts GMD_PRODUCER_POWERPDF 0s/0h of 5641 corpus (4064s/1577h AxB-MANUAL) 07/11/07
255
256# producer is bcl
257body GMD_PRODUCER_EASYPDF eval:pdf_match_details('producer','/^BCL easyPDF/')
258describe GMD_PRODUCER_EASYPDF PDF producer was BCL easyPDF
259score GMD_PRODUCER_EASYPDF 0.25
260
261# simple check for encryption used inside pdf.
262# recommend meta with something else...
263body GMD_PDF_ENCRYPTED eval:pdf_is_encrypted()
264describe GMD_PDF_ENCRYPTED Attached PDF is encrypted
265score GMD_PDF_ENCRYPTED 0.60
266# counts GMD_PDF_ENCRYPTED 13s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
267
268# simple check for empty msg body when there is one or more pdf attachments present.
269body GMD_PDF_EMPTY_BODY eval:pdf_is_empty_body()
270describe GMD_PDF_EMPTY_BODY Attached PDF with empty message body
271score GMD_PDF_EMPTY_BODY 0.25
272# counts GMD_PDF_EMPTY_BODY 1638s/20h of 27034 corpus (24636s/2398h AxB-MANUAL) 07/19/07
dfdd1e08 273priority GMD_PDF_EMPTY_BODY 2000 # workaround for Bug 8070
b780ea8d
SI
274
275######################################################################################################
276# metas
277#meta __GMD_PDF_CHECKSUM ( GMD_PDF_FUZZY1_T1 || GMD_PDF_FUZZY2_T1 || GMD_PDF_FUZZY2_T2 || GMD_PDF_FUZZY2_T3 || GMD_PDF_FUZZY2_T4 || GMD_PDF_FUZZY2_T5 || GMD_PDF_FUZZY2_T6 || GMD_PDF_FUZZY2_T7 ||GMD_PDF_FUZZY2_T9 || GMD_PDF_FUZZY2_T10 || GMD_PDF_FUZZY2_T11 || GMD_PDF_FUZZY2_T12 )
278#meta __GMD_PDF_DETAIL ( GMD_AUTHOR_COLET || GMD_AUTHOR_MOBILE || GMD_AUTHOR_OOO || GMD_AUTHOR_HPADMIN || GMD_PRODUCER_GPL || GMD_PRODUCER_POWERPDF || GMD_PRODUCER_EASYPDF )
279meta __GMD_PDF_DIMS ( GMD_PDF_VERT || GMD_PDF_HORIZ || GMD_PDF_SQUARE )
280meta __GMD_PDF_PRODUCERS ( GMD_PRODUCER_GPL || GMD_PRODUCER_POWERPDF || GMD_PRODUCER_EASYPDF )
281
282# rule hits ham by itself, so use it only as part of a meta rule.
283body __GMD_PDF_NO_TXT eval:pdf_image_to_text_ratio(0.000, 0.005)
284
285# meta checksum hit with image dimensions
286#meta GMD_PDF_STOX_M1 ( __GMD_PDF_CHECKSUM && __GMD_PDF_DIMS)
287#describe GMD_PDF_STOX_M1 PDF Stox spam
288#score GMD_PDF_STOX_M1 3.25
289# counts GMD_PDF_STOX_M1 159s/0h of 6132 corpus (555s/1577h AxB-MANUAL) 07/11/07
290# counts GMD_PDF_STOX_M1 40s/0h of 11773 corpus (10988s/785h AxB2-TRAPS) 07/11/07
291
292# meta checksum hit to pdf details
293#meta GMD_PDF_STOX_M2 ( __GMD_PDF_CHECKSUM && __GMD_PDF_DETAIL )
294#describe GMD_PDF_STOX_M2 PDF Stox spam
295#score GMD_PDF_STOX_M2 2.95
296# counts GMD_PDF_STOX_M2 223s/0h of 6132 corpus (555s/1577h AxB-MANUAL) 07/11/07
297# counts GMD_PDF_STOX_M2 29s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
298
299# meta dimensions and encryption
300#meta GMD_PDF_STOX_M3 ( __GMD_PDF_DIMS && GMD_PDF_ENCRYPTED )
301#describe GMD_PDF_STOX_M3 PDF Stox spam
302#score GMD_PDF_STOX_M3 2.25
303# counts GMD_PDF_STOX_M3 12s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
304
305# meta checksum with no text
306#meta GMD_PDF_STOX_M4 ( __GMD_PDF_CHECKSUM && (__GMD_PDF_NO_TXT || GMD_PDF_EMPTY_BODY))
307#describe GMD_PDF_STOX_M4 PDF Stox spam
308#score GMD_PDF_STOX_M4 2.95
309
310# meta no body text along with automated pdf production.
311#meta GMD_PDF_STOX_M5 ( __GMD_PDF_PRODUCERS && (__GMD_PDF_NO_TXT || GMD_PDF_EMPTY_BODY))
312#describe GMD_PDF_STOX_M5 PDF Stox Spam
313#score GMD_PDF_STOX_M5 1.00
314
315endif