/usr/bin/sb_mailsort is in spambayes 1.1b1-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#! /usr/bin/python
"""\
To train:
%(program)s -t ham.mbox spam.mbox
To filter mail (using .forward or .qmail):
|%(program)s Maildir/ Mail/Spam/
To print the score and top evidence for a message or messages:
%(program)s -s message [message ...]
"""
# A message whose spam probability is strictly greater than this value is
# delivered to the spam maildir by filter_message(); anything else is ham.
SPAM_CUTOFF = 0.57
SIZE_LIMIT = 5000000 # messages larger are not analyzed
# Number of bytes requested from stdin per read() call while filtering.
BLOCK_SIZE = 10000
# Per-user directory holding the word database and the configuration file;
# train_messages() creates it when it does not exist.
RC_DIR = "~/.spambayes"
# Default classifier database path (overridable with the -d option).
DB_FILE = RC_DIR + "/wordprobs.cdb"
# Default value for the BAYESCUSTOMIZE environment variable
# (overridable with the -c option).
CONFIG_FILE = RC_DIR + "/bayescustomize.ini"
import sys
import os
import getopt
import email
import time
import signal
import socket
import errno
# Expand the leading "~" once at import time so that later open() calls get a
# usable path.  A path supplied with -d replaces this value as given.
DB_FILE = os.path.expanduser(DB_FILE)
def import_spambayes():
    """Import the spambayes pieces this script needs into module globals.

    If the BAYESCUSTOMIZE environment variable is not already set, it is
    pointed at the (tilde-expanded) CONFIG_FILE before any spambayes module
    is imported.  The imports are deferred to this function so that main()
    can apply a -c override to CONFIG_FILE first.
    NOTE(review): presumably spambayes reads BAYESCUSTOMIZE at import time,
    which is why the ordering matters -- confirm against spambayes.Options.
    """
    global mboxutils, CdbClassifier, tokenize

    if 'BAYESCUSTOMIZE' not in os.environ:
        os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE)

    from spambayes import mboxutils
    from spambayes.cdb_classifier import CdbClassifier
    from spambayes.tokenizer import tokenize
program = sys.argv[0] # For usage(); referenced by docstring above


def usage(code, msg=''):
    """Print usage message and sys.exit(code).

    When *msg* is non-empty it is written to stderr first, followed by a
    blank line.  The module docstring, with %(program)s filled in from the
    module globals, is then written to stderr.
    """
    err = sys.stderr
    if msg:
        # msg may be an exception instance (see main); %s applies str() to it.
        err.write("%s\n\n" % (msg,))
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)
def maketmp(dir):
    """Create a uniquely named, exclusively opened file under ``dir/tmp``.

    The file name has the form ``<unix time>.<pid>.<hostname>``.  Up to 200
    attempts are made; after an attempt that fails with EEXIST (name already
    taken) or EINTR the function sleeps two seconds, so the time component
    changes before the next try.

    Returns a tuple ``(fileobj, pathname, filename)`` where *fileobj* is
    opened for binary writing, *pathname* is ``dir/tmp/filename``.

    Raises OSError for any other os.open() failure (for example a missing
    ``tmp`` subdirectory) and SystemExit if every attempt failed.
    """
    # Function-scope import: stat is not among the file's top-level imports.
    import stat

    hostname = socket.gethostname()
    pid = os.getpid()
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
    owner_rw = stat.S_IRUSR | stat.S_IWUSR  # octal 600
    fd = -1
    for attempt in range(200):
        filename = "%d.%d.%s" % (time.time(), pid, hostname)
        pathname = "%s/tmp/%s" % (dir, filename)
        try:
            fd = os.open(pathname, open_flags, owner_rw)
        except OSError:
            # os.open() reports failures as OSError (not IOError).
            # sys.exc_info() is used instead of "except X, e" / "except X as e"
            # so the block is valid on every Python version.
            exc = sys.exc_info()[1]
            if exc.errno not in (errno.EINTR, errno.EEXIST):
                raise
        else:
            break
        time.sleep(2)
    if fd == -1:
        raise SystemExit("could not create a mail file")
    return (os.fdopen(fd, "wb"), pathname, filename)
def train(bayes, msgs, is_spam):
    """Feed every message of the mailbox *msgs* to the classifier.

    *msgs* is handed to mboxutils.getmbox(); each message it yields is
    tokenized and passed to ``bayes.learn`` together with *is_spam*.
    """
    for message in mboxutils.getmbox(msgs):
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)
def train_messages(ham_name, spam_name):
    """Create database using messages.

    Builds a fresh CdbClassifier, trains it with the mailbox *ham_name* as
    ham and *spam_name* as spam, then writes the word information to DB_FILE.
    The RC_DIR directory is created first when it does not exist.  Progress
    messages are printed to stdout.
    """
    rc_dir = os.path.expanduser(RC_DIR)
    if not os.path.exists(rc_dir):
        print("Creating %s directory..." % (RC_DIR,))
        os.mkdir(rc_dir)

    bayes = CdbClassifier()
    print('Training with ham...')
    train(bayes, ham_name, False)
    print('Training with spam...')
    train(bayes, spam_name, True)

    print('Update probabilities and writing DB...')
    db = open(DB_FILE, "wb")
    try:
        bayes.save_wordinfo(db)
    finally:
        # Close the handle even when saving fails part-way through.
        db.close()
    print('done')
def filter_message(hamdir, spamdir):
    """Read one message from stdin and deliver it to *hamdir* or *spamdir*.

    The message (prefixed with the DTLINE environment value, if any) is
    written to a temporary file created by maketmp(hamdir) and then renamed
    into the ``new`` subdirectory of the chosen directory.  Messages of
    SIZE_LIMIT bytes or more are not scored and are delivered as ham.  On any
    exception the temporary file is removed and the exception re-raised.
    """
    # Abort with exit status 1 if delivery has not finished after 24 hours.
    signal.signal(signal.SIGALRM, lambda s, f: sys.exit(1))
    signal.alarm(24 * 60 * 60)

    # write message to temporary file (must be on same partition)
    tmpfile, pathname, filename = maketmp(hamdir)
    try:
        tmpfile.write(os.environ.get("DTLINE", ""))  # delivered-to line
        total = 0
        chunks = []
        while True:
            chunk = sys.stdin.read(BLOCK_SIZE)
            if not chunk:
                break
            total += len(chunk)
            # Keep the text in memory only while it is still small enough
            # to be analyzed; the file on disk always gets everything.
            if total < SIZE_LIMIT:
                chunks.append(chunk)
            tmpfile.write(chunk)
        tmpfile.close()

        prob = 0.0  # oversized messages are never classified as spam
        if total < SIZE_LIMIT:
            text = ''.join(chunks)
            del chunks
            msg = email.message_from_string(text)
            del text
            bayes = CdbClassifier(open(DB_FILE, 'rb'))
            prob = bayes.spamprob(tokenize(msg))

        if prob > SPAM_CUTOFF:
            destdir = spamdir
        else:
            destdir = hamdir
        os.rename(pathname, "%s/new/%s" % (destdir, filename))
    except:
        # Deliberately bare: the SystemExit raised by the alarm handler must
        # also trigger removal of the temporary file.
        os.unlink(pathname)
        raise
def print_message_score(msg_name, msg_fp):
    """Print the spam probability and evidence for one message.

    The message is parsed from the open file *msg_fp* and scored with the
    classifier database at DB_FILE.  The first output line is *msg_name*
    followed by the probability; each (word, probability) pair of evidence
    follows on its own indented line.
    """
    msg = email.message_from_file(msg_fp)
    bayes = CdbClassifier(open(DB_FILE, 'rb'))
    score, clues = bayes.spamprob(tokenize(msg), evidence=True)
    print("%s %s" % (msg_name, score))
    for word, word_prob in clues:
        print("%s %s %s" % (' ', repr(word), word_prob))
def main():
    """Parse the command line and run the selected mode.

    Options:
      -t       train: arguments are ``ham.mbox spam.mbox``
      -s       score: arguments are message files (stdin when none given)
      -d FILE  use FILE as the classifier database instead of DB_FILE
      -c FILE  use FILE as CONFIG_FILE
    Without -t or -s the script sorts the message on stdin into one of the
    two maildirs given as arguments.  Bad options or a wrong argument count
    end the program through usage(2, ...).
    """
    global DB_FILE, CONFIG_FILE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'tsd:c:')
    except getopt.error:
        # sys.exc_info() avoids version-specific "except ..., name" syntax.
        usage(2, sys.exc_info()[1])

    mode = 'sort'
    for opt, val in opts:
        if opt == '-t':
            mode = 'train'
        elif opt == '-s':
            mode = 'score'
        elif opt == '-d':
            DB_FILE = val
        elif opt == '-c':
            CONFIG_FILE = val
        else:
            assert 0, 'invalid option'

    # Must run after option parsing: it reads CONFIG_FILE, which -c may set.
    import_spambayes()

    if mode == 'sort':
        if len(args) != 2:
            usage(2, 'wrong number of arguments')
        filter_message(args[0], args[1])
    elif mode == 'train':
        if len(args) != 2:
            usage(2, 'wrong number of arguments')
        train_messages(args[0], args[1])
    elif mode == 'score':
        if args:
            for msg in args:
                msg_fp = open(msg)
                try:
                    print_message_score(msg, msg_fp)
                finally:
                    # Do not accumulate open handles across many messages.
                    msg_fp.close()
        else:
            print_message_score('<stdin>', sys.stdin)


if __name__ == "__main__":
    main()
|