/usr/share/pyshared/chemfp/commandline/sdf2fps.py is in python-chemfp 1.1p1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
from __future__ import absolute_import
import sys
import re
import itertools
from .. import Metadata, FingerprintIterator, ParseError
from .. import argparse
from .. import encodings
from .. import sdf_reader
from .. import io
from .. import error_handlers
from . import cmdsupport
# Python 2.5 has no built-in next(); define a minimal stand-in only when
# the name lookup fails, so newer interpreters keep using the builtin.
try:
    next
except NameError:
    def next(it):
        """Return the next item from *it* by calling its ``next()`` method."""
        return it.next()
def _check_num_bits(num_bits, # from the user
fp_num_bits, # not None if the fp decoder know it exactly
num_bytes, # length of decoded fp in bytes
parser):
"""Check that the number of fingerprint bits and bytes match the user input
Difficulties: some fingerprints have only a byte length, and the user
doesn't have to specify the input.
Returns the number of bits, or calls parser.error if there are problems
"""
if fp_num_bits is not None:
# The fingerprint knows exactly how many bits it contains
if num_bits is None:
# The user hasn't specified, so go with the exact number
return fp_num_bits
# If the user gave a value, make sure it matches
if num_bits != fp_num_bits:
parser.error(
("the first fingerprint has %(fp_num_bits)s bits which "
"is not the same as the --num-bits value of %(num_bits)s") % dict(
num_bits=num_bits, fp_num_bits=fp_num_bits))
raise AssertionError("should not get here")
return num_bits
# If the number of bits isn't specified, assume it's exactly
# enough to fill up the fingerprint bytes.
if num_bits is None:
return num_bytes * 8
# The user specified the number of bits. The first fingerprint
# has a number of bytes. This must be enough to hold the bits,
# but only up to 7 bits larger.
if (num_bits+7)//8 != num_bytes:
parser.error(
("The byte length of the first fingerprint is %(num_bytes)s so --num-bits "
"must be %(min)s <= num-bits <= %(max)s, not %(num_bits)s") % dict(
num_bytes=num_bytes, min=num_bytes*8-7, max=num_bytes*8,
num_bits=num_bits))
raise AssertError("should not get here")
# Accept what the user suggested
return num_bits
# Command-line definition for sdf2fps. Every option is registered on this
# module-level parser; main() calls parser.parse_args() and parser.error().
parser = argparse.ArgumentParser(
    description="Extract a fingerprint tag from an SD file and generate FPS fingerprints",
    )

# Positional arguments: zero or more SD files.
parser.add_argument(
    "filenames",
    nargs="*",
    default=None,
    help="input SD files (default is stdin)")

# Which SD tags supply the record id and the encoded fingerprint.
parser.add_argument(
    "--id-tag",
    metavar="TAG",
    default=None,
    help="get the record id from TAG instead of the first line of the record")
parser.add_argument(
    "--fp-tag",
    metavar="TAG",
    help="get the fingerprint from tag TAG (required)")

parser.add_argument(
    "--num-bits",
    type=int,
    metavar="INT",
    help="use the first INT bits of the input. Use only when the "
         "last 1-7 bits of the last byte are not part of the fingerprint. "
         "Unexpected errors will occur if these bits are not all zero.")

parser.add_argument(
    "--errors",
    choices=["strict", "report", "ignore"],
    default="strict",
    help="how should structure parse errors be handled? (default=strict)")

parser.add_argument(
    "-o", "--output",
    metavar="FILENAME",
    help="save the fingerprints to FILENAME (default=stdout)")

# Free-text descriptions for the software and fingerprint type.
parser.add_argument(
    "--software",
    metavar="TEXT",
    help="use TEXT as the software description")
parser.add_argument(
    "--type",
    metavar="TEXT",
    help="use TEXT as the fingerprint type description")

# TODO:
# Do I want "--gzip", "--auto", "--none", "--bzip2", and "--decompress METHOD"?
# Do I want to support encoding of the fps output?
# Or, why support all these? Why not just "--in gz", "--in bz2" and be done
# with it (do I really need to specify the 'auto' and 'none' options?)
# A matching "--compress METHOD" option for the output is not enabled.
parser.add_argument(
    "--decompress",
    action="store",
    default="auto",
    metavar="METHOD",
    help="use METHOD to decompress the input (default='auto', 'none', 'gzip', 'bzip2')")
# This adds --cactvs, --base64 and other decoders to the command-line arguments.
# main() later asks encodings._extract_decoder() which decoder was selected.
encodings._add_decoding_group(parser)
# Support the "--pubchem" option: it is registered in its own "shortcuts"
# argument group.
shortcuts_group = parser.add_argument_group("shortcuts")
class AddSubsKeys(argparse.Action):
    """argparse action which fills in the PubChem CACTVS settings.

    When triggered it sets the cactvs, software, type and fp_tag
    attributes of the namespace; the option's own values are not used.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # NOTE(review): an earlier comment said the version number ("1.3")
        # was based solely on the version of the document at
        # ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
        # but the type string below says 1.0 -- confirm which is intended.
        settings = (
            ("cactvs", True),
            ("software", "CACTVS/unknown"),
            ("type", "CACTVS-E_SCREEN/1.0 extended=2"),
            ("fp_tag", "PUBCHEM_CACTVS_SUBSKEYS"),
        )
        for attr_name, attr_value in settings:
            setattr(namespace, attr_name, attr_value)
# --pubchem takes no value (nargs=0). When it appears on the command line the
# AddSubsKeys action sets the cactvs, software, type and fp_tag settings
# named in the help text.
shortcuts_group.add_argument("--pubchem", nargs=0, action=AddSubsKeys,
    help = ("decode CACTVS substructure keys used in PubChem. Same as "
            "--software=CACTVS/unknown --type 'CACTVS-E_SCREEN/1.0 extended=2' "
            "--fp-tag=PUBCHEM_CACTVS_SUBSKEYS --cactvs"))
###############
# Matches a single ASCII control character (octal 000 through 037, i.e. code
# points 0-31). main() uses it to reject --software and --type descriptions
# which contain such a character.
_illegal_value_pat = re.compile(r"[\000-\037]")
def main(args=None):
    """Run the sdf2fps command-line program.

    Reads records from the input SD files (or from stdin when no filename
    is given), decodes the fingerprint stored under the --fp-tag tag of
    each record and passes the resulting (id, fingerprint) pairs, together
    with the metadata, to io.write_fps1_output.

    Parameters:
      args -- the command-line arguments, passed unchanged to
          parser.parse_args(). The default of None is passed through too
          (presumably argparse then falls back to sys.argv -- confirm
          against the bundled argparse).

    Invalid command-line values are reported through parser.error().
    A ParseError raised while reading the first record or while writing
    the output is reported on stderr and followed by SystemExit(1).
    """
    args = parser.parse_args(args)

    if not args.fp_tag:
        parser.error("argument --fp-tag is required")

    if args.num_bits is not None and args.num_bits <= 0:
        parser.error("--num-bits must be a positive integer")

    fp_decoder_name, fp_decoder = encodings._extract_decoder(parser, args)

    missing = cmdsupport.check_filenames(args.filenames)
    if missing:
        parser.error("Structure file %r does not exist" % (missing,))

    # The --software and --type descriptions may not contain any character
    # matched by _illegal_value_pat.
    for attr in ("software", "type"):
        description = getattr(args, attr, None)
        if description is None:
            continue
        m = _illegal_value_pat.search(description)
        if m is None:
            continue
        parser.error("--%(attr)s description may not contain the character %(c)r" % dict(
            attr=attr, c=m.group(0)))

    error_handler = error_handlers.get_parse_error_handler(args.errors)

    # What follows is a bit tricky. I set up a chain of iterators:
    #   - iterate through the SDF iterators
    #   - iterate through the (id, encoded_fp) pairs in each SDF iterator
    #   - convert to (id, fp, num_bits) 3-element tuples
    #   - use the first element to figure out the right metadata
    #   - send to (id, fp) information to the io.write_fps1_output function

    # The location object is shared with the SDF readers and is used to
    # build the "where" part of the error messages.
    location = sdf_reader.FileLocation()

    # Iterate through each of the filenames, yielding the corresponding SDF iterator
    def get_sdf_iters():
        if not args.filenames:
            yield sdf_reader.open_sdf(None, args.decompress, location=location)
        else:
            for filename in args.filenames:
                location.filename = filename
                location.lineno = 1
                yield sdf_reader.open_sdf(filename, args.decompress, location=location)

    # Pick the error messages for a missing id or fingerprint, and the
    # reader which extracts the (id, encoded_fp) pairs from one SDF
    # iterator. The id is the record title unless --id-tag was given.
    if args.id_tag is None:
        MISSING_ID = "Missing title in the record starting %(where)s"
        MISSING_FP = "Missing fingerprint tag %(tag)r in record starting %(where)s"

        def read_id_and_fp(sdf_iter):
            return sdf_reader.iter_title_and_tag(sdf_iter, args.fp_tag)
    else:
        MISSING_ID = "Missing id tag %(tag)r in the record starting %(where)s"
        MISSING_FP = "Missing fingerprint tag %(tag)r in record %(id)r starting %(where)s"

        def read_id_and_fp(sdf_iter):
            return sdf_reader.iter_two_tags(sdf_iter, args.id_tag, args.fp_tag)

    # For each SDF iterator, yield the (id, encoded_fp) pairs
    def iter_encoded_fingerprints(sdf_iters):
        for sdf_iter in sdf_iters:
            for record_id, encoded_fp in read_id_and_fp(sdf_iter):
                if record_id:
                    record_id = io.remove_special_characters_from_id(record_id)
                yield record_id, encoded_fp

    # At this point I don't have enough information to generate the metadata.
    # I won't get that until I've read the first record.

    # Decode the encoded fingerprints, yielding (id, fp, num_bits).
    # Each problem record is passed to error_handler; if the handler
    # returns instead of raising, the record is skipped.
    def decode_fingerprints(encoded_fp_reader, error_handler):
        # -1 means "no fingerprint seen yet". None cannot be used for that
        # because the first fingerprint's own bit count may be None.
        expected_num_bits = -1
        expected_fp_size = None

        for record_id, encoded_fp in encoded_fp_reader:
            if not record_id:
                msg = MISSING_ID % dict(id=record_id, where=location.where(),
                                        tag=args.id_tag)
                error_handler(msg)
                continue

            if not encoded_fp:
                msg = MISSING_FP % dict(id=record_id, where=location.where(),
                                        tag=args.fp_tag)
                error_handler(msg)
                continue

            # Decode the fingerprint, and complain if it isn't decodeable.
            try:
                num_bits, fp = fp_decoder(encoded_fp)
            except ValueError:
                # sys.exc_info() is used instead of the "except E, err" or
                # "except E as err" syntax so this parses on Python 2.5
                # as well as on newer interpreters.
                err = sys.exc_info()[1]
                msg = ("Could not %(decoder_name)s decode %(tag)r value %(encoded_fp)r: %(err)s %(where)s" %
                       dict(decoder_name=fp_decoder_name, tag=args.fp_tag,
                            where=location.where(), err=err, encoded_fp=encoded_fp))
                error_handler(msg)
                continue

            # Every fingerprint must have the same bit count as the first one
            if num_bits != expected_num_bits:
                if expected_num_bits == -1:
                    expected_num_bits = num_bits
                else:
                    msg = ("Tag %(tag)r value %(encoded_fp)r has %(got)d bits but expected %(expected)d %(where)s" %
                           dict(tag=args.fp_tag, encoded_fp=encoded_fp,
                                got=num_bits, expected=expected_num_bits,
                                where=location.where()))
                    error_handler(msg)
                    continue

            # ... and the same byte length as the first one
            if len(fp) != expected_fp_size:
                if expected_fp_size is None:
                    expected_fp_size = len(fp)
                else:
                    msg = ("Tag %(tag)r value %(encoded_fp)r has %(got)d bytes but expected %(expected)d %(where)s" %
                           dict(tag=args.fp_tag, encoded_fp=encoded_fp,
                                got=len(fp), expected=expected_fp_size,
                                where=location.where()))
                    error_handler(msg)
                    continue

            yield record_id, fp, num_bits

    sdf_iters = get_sdf_iters()
    encoded_fps = iter_encoded_fingerprints(sdf_iters)
    decoded_fps = decode_fingerprints(encoded_fps, error_handler)

    # The first decoded fingerprint determines the number of bits to put
    # in the metadata, so read it before writing anything.
    try:
        first_id, first_fp, first_num_bits = next(decoded_fps)
    except ParseError:
        err = sys.exc_info()[1]
        sys.stderr.write("ERROR: %s. Exiting." % (err,))
        raise SystemExit(1)
    except StopIteration:
        # No fingerprints? Make a new empty stream
        metadata = Metadata(date = io.utcnow())
        chained_reader = iter([])
    else:
        # Got the first fingerprint
        expected_num_bytes = len(first_fp)

        # Verify that they match
        expected_num_bits = _check_num_bits(args.num_bits, first_num_bits,
                                            expected_num_bytes, parser)

        # Put the first record back in front of the remaining ones, and
        # drop the num_bits element from the rest.
        chained_reader = itertools.chain(
            [(first_id, first_fp)], (x[:2] for x in decoded_fps))
        metadata = Metadata(num_bits = expected_num_bits,
                            software = args.software,
                            type = args.type,
                            sources = args.filenames,
                            date = io.utcnow())

    try:
        io.write_fps1_output(chained_reader, args.output, metadata)
    except ParseError:
        err = sys.exc_info()[1]
        sys.stderr.write("ERROR: %s. Exiting." % (err,))
        raise SystemExit(1)
# Run the command-line program when this file is executed as a script.
if __name__ == "__main__":
    main()
|