This file is indexed.

/usr/bin/sb_mailsort is in spambayes 1.1b1-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#! /usr/bin/python
"""\
To train:
    %(program)s -t ham.mbox spam.mbox

To filter mail (using .forward or .qmail):
    |%(program)s Maildir/ Mail/Spam/

To print the score and top evidence for a message or messages:
    %(program)s -s message [message ...]
"""

# A message whose spam probability is strictly greater than this value is
# delivered to the spam directory; everything else goes to the ham directory.
SPAM_CUTOFF = 0.57

# Messages of this many bytes or more are not analyzed; they are delivered
# as ham.
SIZE_LIMIT = 5000000
# Amount of data read from stdin per read() call while spooling a message.
BLOCK_SIZE = 10000
# Per-user directory holding the default database and configuration file.
RC_DIR = "~/.spambayes"
# Default word database; can be overridden with the -d option.
DB_FILE = RC_DIR + "/wordprobs.cdb"
# Default configuration file; can be overridden with the -c option.
CONFIG_FILE = RC_DIR + "/bayescustomize.ini"

import sys
import os
import getopt
import email
import time
import signal
import socket
import errno

# Expand the leading "~" of the default database path now that os is
# imported.  A path given with -d replaces this value as-is.
DB_FILE = os.path.expanduser(DB_FILE)

def import_spambayes():
    """Import the spambayes names used by this script into module globals.

    Before importing, BAYESCUSTOMIZE is set in the environment to the
    expanded CONFIG_FILE path unless the variable is already present.
    main() calls this only after option parsing, so a -c override of
    CONFIG_FILE is in effect by then (presumably spambayes reads the
    variable when it is imported -- confirm against spambayes itself).

    Binds the globals mboxutils, CdbClassifier and tokenize.
    """
    global mboxutils, CdbClassifier, tokenize
    # "in" replaces the deprecated dict.has_key(), which no longer exists
    # in Python 3; the test is otherwise the same.
    if 'BAYESCUSTOMIZE' not in os.environ:
        os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE)
    from spambayes import mboxutils
    from spambayes.cdb_classifier import CdbClassifier
    from spambayes.tokenizer import tokenize


# Name this script was invoked as.  usage() substitutes it for the
# %(program)s placeholders in the module docstring via globals().
program = sys.argv[0] # For usage(); referenced by docstring above

def usage(code, msg=''):
    """Write the usage text to stderr, then exit with status `code`.

    When `msg` is non-empty it is written first, followed by a blank
    line.  The usage text is the module docstring with its %(name)s
    placeholders filled in from the module globals.
    """
    err = sys.stderr
    if msg:
        # Message line plus one empty separator line.
        err.write("%s\n\n" % (msg,))
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)

def maketmp(dir):
    """Create a new, uniquely named file under the tmp/ subdirectory of dir.

    The file name has the form "<unix time>.<pid>.<hostname>".  The file
    is created with O_CREAT|O_EXCL and mode 0600, so an existing file is
    never reused.  If the name is already taken (EEXIST) or the call was
    interrupted (EINTR), the function sleeps two seconds -- which also
    changes the timestamp part of the name -- and tries again, for at
    most 200 attempts.

    dir -- directory that contains a tmp/ subdirectory (Maildir layout)

    Returns a tuple (file object opened with mode "wb", pathname, filename).
    Raises OSError for any other failure of os.open (for example a missing
    tmp/ directory) and SystemExit if every attempt failed.
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    for _ in range(200):
        filename = "%d.%d.%s" % (time.time(), pid, hostname)
        pathname = "%s/tmp/%s" % (dir, filename)
        try:
            fd = os.open(pathname,
                         os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
        except OSError as exc:
            # os.open reports failures as OSError (not IOError).  Only a
            # name collision or an interrupted call is worth retrying.
            if exc.errno not in (errno.EINTR, errno.EEXIST):
                raise
        else:
            return (os.fdopen(fd, "wb"), pathname, filename)
        time.sleep(2)
    raise SystemExit("could not create a mail file")

def train(bayes, msgs, is_spam):
    """Teach the classifier every message found in a mailbox.

    bayes   -- classifier object providing learn(tokens, is_spam)
    msgs    -- mailbox specification handed to mboxutils.getmbox()
    is_spam -- True to learn the messages as spam, False as ham
    """
    for message in mboxutils.getmbox(msgs):
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)

def train_messages(ham_name, spam_name):
    """Create the word database from a ham mailbox and a spam mailbox.

    ham_name  -- mailbox of legitimate mail, trained with is_spam=False
    spam_name -- mailbox of spam, trained with is_spam=True

    The expanded RC_DIR directory is created if it does not exist yet.
    A new CdbClassifier is trained on both mailboxes and then saved to
    DB_FILE, replacing any previous contents.  Progress is printed to
    stdout.
    """
    rc_dir = os.path.expanduser(RC_DIR)
    if not os.path.exists(rc_dir):
        print("Creating %s directory..." % RC_DIR)
        os.mkdir(rc_dir)
    bayes = CdbClassifier()
    print('Training with ham...')
    train(bayes, ham_name, False)
    print('Training with spam...')
    train(bayes, spam_name, True)
    print('Update probabilities and writing DB...')
    db = open(DB_FILE, "wb")
    try:
        bayes.save_wordinfo(db)
    finally:
        # Release the handle even when saving fails part-way.
        db.close()
    print('done')

def filter_message(hamdir, spamdir):
    """Read one message from stdin and deliver it to hamdir or spamdir.

    The message is first spooled to a temporary file under hamdir/tmp,
    prefixed with the contents of the DTLINE environment variable if set.
    Messages shorter than SIZE_LIMIT are scored with the classifier stored
    in DB_FILE; larger ones get a score of 0.0.  The temporary file is
    then renamed into spamdir/new when the score exceeds SPAM_CUTOFF and
    into hamdir/new otherwise.

    On any failure the temporary file is removed and the exception is
    re-raised.  A SIGALRM handler makes the process exit with status 1
    after 24 hours.
    """
    def give_up(signum, frame):
        sys.exit(1)

    signal.signal(signal.SIGALRM, give_up)
    signal.alarm(24 * 60 * 60)

    # The spool file lives under hamdir so the final rename() does not
    # cross partitions (both destinations must be on that partition).
    spool, pathname, filename = maketmp(hamdir)
    try:
        spool.write(os.environ.get("DTLINE", "")) # delivered-to line
        total = 0
        kept = []
        while True:
            chunk = sys.stdin.read(BLOCK_SIZE)
            if not chunk:
                break
            total += len(chunk)
            # Stop keeping data in memory once the limit is reached, but
            # still copy everything to the spool file.
            if total < SIZE_LIMIT:
                kept.append(chunk)
            spool.write(chunk)
        spool.close()

        prob = 0.0
        if total < SIZE_LIMIT:
            text = ''.join(kept)
            del kept
            msg = email.message_from_string(text)
            del text
            bayes = CdbClassifier(open(DB_FILE, 'rb'))
            prob = bayes.spamprob(tokenize(msg))

        if prob > SPAM_CUTOFF:
            destdir = spamdir
        else:
            destdir = hamdir
        os.rename(pathname, "%s/new/%s" % (destdir, filename))
    except:
        # Deliberately catches everything, including the SystemExit raised
        # by the alarm handler: drop the spool file, then propagate.
        os.unlink(pathname)
        raise

def print_message_score(msg_name, msg_fp):
    """Print the spam score of one message and the evidence behind it.

    msg_name -- label printed in front of the score
    msg_fp   -- open file object the message is parsed from

    The first output line is "<msg_name> <score>"; each following line
    shows one evidence item as its repr() followed by its probability.
    """
    msg = email.message_from_file(msg_fp)
    bayes = CdbClassifier(open(DB_FILE, 'rb'))
    score, clues = bayes.spamprob(tokenize(msg), evidence=True)
    print("%s %s" % (msg_name, score))
    for word, word_prob in clues:
        print("   %r %s" % (word, word_prob))

def main():
    """Parse the command line and run the selected mode.

    Options:
        -t       train: arguments are a ham mailbox and a spam mailbox
        -s       score: arguments are message files (stdin if none given)
        -d FILE  use FILE as the database instead of the default
        -c FILE  use FILE as the configuration file instead of the default

    Without -t or -s the script filters the message on stdin; the two
    arguments are then the ham directory and the spam directory.  Bad
    options or a wrong number of arguments print the usage text and exit
    with status 2.
    """
    global DB_FILE, CONFIG_FILE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'tsd:c:')
    except getopt.error as msg:
        usage(2, msg)

    mode = 'sort'
    for opt, val in opts:
        if opt == '-t':
            mode = 'train'
        elif opt == '-s':
            mode = 'score'
        elif opt == '-d':
            DB_FILE = val
        elif opt == '-c':
            CONFIG_FILE = val
        else:
            assert 0, 'invalid option'

    # Must come after option handling so that a -c override of CONFIG_FILE
    # is seen when import_spambayes() sets up the environment.
    import_spambayes()

    if mode == 'sort':
        if len(args) != 2:
            usage(2, 'wrong number of arguments')
        filter_message(args[0], args[1])
    elif mode == 'train':
        if len(args) != 2:
            usage(2, 'wrong number of arguments')
        train_messages(args[0], args[1])
    elif mode == 'score':
        if args:
            for msg in args:
                fp = open(msg)
                try:
                    print_message_score(msg, fp)
                finally:
                    # Close each message file as soon as it is scored.
                    fp.close()
        else:
            print_message_score('<stdin>', sys.stdin)


if __name__ == "__main__":
    main()