/usr/share/pyshared/reverend/guessers/email.py is in python-reverend 0.4-0ubuntu1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
# amir@divmod.org. This is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
#
from rfc822 import AddressList
from reverend.thomas import Bayes
class EmailClassifier(Bayes):
    """Bayes guesser whose tokens are derived from an e-mail message.

    Every method takes ``msg``, an object used through ``get(header,
    default)`` and ``walk()``, whose parts provide ``get_content_type()``
    and ``get_payload(decode=...)`` (presumably an
    ``email.message.Message`` -- confirm against callers).

    Tokens come from three sources: the subject/from/to/cc headers, the
    first ``text/plain`` part, and synthetic "meta" tokens describing
    selected header values, body sizes and recipient counts.
    """

    # Headers whose values become ``<header>:<value>`` meta tokens.
    _META_HEADERS = ('Content-type', 'X-Priority', 'X-Mailer',
                     'content-transfer-encoding', 'X-MSMail-Priority')

    # Word-count thresholds for body-size tokens, largest first.  The
    # buckets are cumulative: a very long body yields all three tokens.
    _WORD_COUNT_LIMITS = (10000, 1000, 100)

    # Recipient-count thresholds for the ``to``/``cc`` tokens (cumulative).
    _RECIPIENT_LIMITS = (5, 10)

    def getTokens(self, msg):
        """Return the list of string tokens for ``msg``.

        Override of the parent's ``getTokens``.  The returned strings are
        used as the keys into the table of token counts.  Order is header
        tokens, then body tokens, then meta tokens.
        """
        tokens = self.getHeaderTokens(msg)
        tokens += self.getBodyTokens(msg)
        # Tokens generated from the headers and the message structure.
        tokens += self.getMetaTokens(msg)
        return tokens

    def getBodyTokens(self, msg):
        """Tokenize the first ``text/plain`` part (empty text if none)."""
        text = self.getTextPlain(msg)
        if text is None:
            text = ''
        # ``_tokenizer`` is not defined in this class; presumably it is
        # supplied by the parent ``Bayes``.
        return list(self._tokenizer.tokenize(text))

    def getHeaderTokens(self, msg):
        """Tokenize the subject, from, to and cc headers.

        A missing header is replaced by a placeholder word (``nosubject``,
        ``fromnoone``, ``tonoone``, ``ccnoone``) so that its absence is
        itself a token.
        """
        fields = (
            msg.get('subject', 'nosubject'),
            msg.get('from', 'fromnoone'),
            msg.get('to', 'tonoone'),
            msg.get('cc', 'ccnoone'),
        )
        # Space-separated with a trailing space, as the text was built
        # before.
        text = ' '.join(fields) + ' '
        return list(self._tokenizer.tokenize(text))

    def _firstPayload(self, msg, contentType, decode):
        """Return the payload of the first part of type ``contentType``.

        ``contentType`` must be lowercase.  Returns ``None`` when no part
        of ``msg.walk()`` has that content type.
        """
        for part in msg.walk():
            typ = part.get_content_type()
            if typ and typ.lower() == contentType:
                return part.get_payload(decode=decode)
        return None

    def getTextPlain(self, msg):
        """Return the decoded payload of the first ``text/plain`` part.

        Returns ``None`` if the message has no such part.
        """
        return self._firstPayload(msg, "text/plain", True)

    def getTextHtml(self, msg):
        """Return the raw payload of the first ``text/html`` part.

        Returns ``None`` if the message has no such part.
        """
        # NOTE(review): unlike getTextPlain this passes decode=False, so
        # the payload is returned without transfer decoding.  Kept as-is
        # because callers may depend on the raw payload -- confirm whether
        # the asymmetry is intended.
        return self._firstPayload(msg, "text/html", False)

    def getMetaTokens(self, msg):
        """Return tokens describing the headers and structure of ``msg``.

        The list contains, in order:

        * ``<header>:<value>`` for each header in ``_META_HEADERS``, with
          the value ``None`` when the header is missing;
        * for the plain-text and then the HTML body, either
          ``<stem>_None`` (part missing) or ``<stem>_True`` followed by
          size tokens (``<stem>zero``, ``<stem>more_than_10000``,
          ``<stem>more_than_1000``, ``<stem>more_than_100``) chosen by the
          whitespace-separated word count;
        * ``to_more_than_5`` / ``to_more_than_10`` and the ``cc_``
          equivalents, chosen by the number of addresses in the header.
        """
        r = []
        for header in self._META_HEADERS:
            r.append(header + ':' + msg.get(header, 'None'))

        bodies = (('text', self.getTextPlain(msg)),
                  ('html', self.getTextHtml(msg)))
        for stem, part in bodies:
            if part is None:
                r.append(stem + '_None')
                continue
            r.append(stem + '_True')
            wordCount = len(part.split())
            # Compare by value; identity comparison with an int literal
            # only works by accident of the interpreter's int caching.
            if wordCount == 0:
                r.append(stem + 'zero')
            for limit in self._WORD_COUNT_LIMITS:
                if wordCount > limit:
                    r.append(stem + 'more_than_%d' % limit)

        for header in ('to', 'cc'):
            addresses = AddressList(msg.get(header, '')).addresslist
            # Count the parsed addresses.  Comparing the list itself to a
            # number is always true on Python 2 (and a TypeError on
            # Python 3), which made these tokens carry no information.
            recipients = len(addresses)
            for limit in self._RECIPIENT_LIMITS:
                if recipients > limit:
                    r.append('%s_more_than_%d' % (header, limit))
        return r
|