/usr/share/pyshared/chemfp/commandline/simsearch.py is in python-chemfp 1.1p1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.

from __future__ import with_statement
import math
import sys
import itertools
import time
import chemfp
from chemfp import argparse, io, SOFTWARE, bitops
from chemfp import search
# Suppose you have a 4K fingerprint.
# 1/4096 = 0.000244140625.
# 2/4096 = 0.00048828125
# You only need to show "0.0002" and "0.0005" to
# disambiguate the scores. I don't like seeing only
# the minimum resolution, so I also show at least
# the next bit.
# For 4096 the float_formatter is %.5f and the
# above values are 0.00024 and 0.00049.
# This also prevents the results from being shown
# in scientific notation.
def get_float_formatter(num_bytes):
    """Build a '%.Nf' format string for scores of a num_bytes-byte fingerprint.

    N is one digit more than the minimum needed to distinguish adjacent
    scores (whose resolution is 1/num_bits), and the fixed-point form
    keeps small values out of scientific notation.
    """
    num_bits = num_bytes * 8
    precision = int(math.log10(num_bits)) + 2
    return "%%.%df" % precision
def write_simsearch_magic(outfile):
    """Write the magic/version line that starts a simsearch output file."""
    outfile.write("#Simsearch/1\n")
def write_count_magic(outfile):
    """Write the magic/version line that starts a count output file."""
    outfile.write("#Count/1\n")
def write_simsearch_header(outfile, d):
    """Write '#name=value' header lines from the values present in dict d.

    Scalar fields are emitted in a fixed order and skipped when missing
    or None; the *_sources fields may carry a list of values, written
    one line apiece.
    """
    header_lines = []
    for key in ("num_bits", "type", "software", "queries", "targets"):
        value = d.get(key, None)
        if value is not None:
            header_lines.append("#%s=%s\n" % (key, value))
    for key in ("query_sources", "target_sources"):
        header_lines.extend("#%s=%s\n" % (key, value)
                            for value in d.get(key, []))
    outfile.writelines(header_lines)
#### The NxM cases
def report_threshold(outfile, query_arenas, targets, threshold):
    """Run a threshold Tanimoto search for each query arena and report hits."""
    fmt = get_float_formatter(targets.metadata.num_bytes)

    def do_search(arena):
        return targets.threshold_tanimoto_search_arena(arena, threshold=threshold)

    _report_search(outfile, fmt, query_arenas, do_search)
def report_knearest(outfile, query_arenas, targets, k, threshold):
    """Run a k-nearest Tanimoto search for each query arena and report hits."""
    fmt = get_float_formatter(targets.metadata.num_bytes)

    def do_search(arena):
        return targets.knearest_tanimoto_search_arena(arena, k=k, threshold=threshold)

    _report_search(outfile, fmt, query_arenas, do_search)
def _report_search(outfile, float_formatter, query_arenas, search_function):
hit_formatter = "\t%s\t" + float_formatter
for query_arena in query_arenas:
for query_id, row in zip(query_arena.ids, search_function(query_arena)):
outfile.write("%d\t%s" % (len(row), query_id))
for hit in row.get_ids_and_scores():
outfile.write(hit_formatter % hit)
outfile.write("\n") # XXX flush?
def report_counts(outfile, query_arenas, targets, threshold):
    """Write one 'count<TAB>query_id' line per query fingerprint."""
    for arena in query_arenas:
        hit_counts = targets.count_tanimoto_hits_arena(arena, threshold)
        for query_id, n_hits in zip(arena.ids, hit_counts):
            outfile.write("%d\t%s\n" % (n_hits, query_id))
#### The NxN cases
def do_NxN_searches(args, k, threshold, target_filename):
    """Search the target fingerprints against themselves (the NxN case).

    k is an int or the string "all"; threshold is the minimum Tanimoto
    score.  Output goes to args.output in the Count or Simsearch text
    format depending on args.count, in the same order as the input file.
    """
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount.
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        # Replace each id with its input position; remember the real ids.
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    # After loading, targets.ids[current] is the original input index;
    # invert that to map original index -> sorted (current) index.
    # (The old current_index_to_original_id dict was never used; removed.)
    original_index_to_current_index = dict(zip(targets.ids, xrange(len(targets))))
    t2 = time.time()

    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        # "max_score" is accepted by the template dict but unused.
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)
        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                outfile.write("%d\t%s\n" % (counts[current_index], original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(targets, threshold,
                                                                     batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(targets, k, threshold,
                                                                    batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                hits = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(hits), original_id))
                for (hit_index, score) in hits:
                    # The hit "ids" are input positions; map back to real ids.
                    # (Renamed from original_id to stop shadowing the loop var.)
                    outfile.write(hit_formatter % (original_ids[hit_index], score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))
####
def int_or_all(s):
    """argparse type for -k: pass the string 'all' through, else parse an int."""
    return s if s == "all" else int(s)
# the "2fps" options need a way to say "get the options from --reference"
# ob2fps --reference targets.fps | simsearch -k 5 --threshold 0.5 targets.fps
parser = argparse.ArgumentParser(
description="Search an FPS file for similar fingerprints")
parser.add_argument("-k" ,"--k-nearest", help="select the k nearest neighbors (use 'all' for all neighbors)",
default=None, type=int_or_all)
parser.add_argument("-t" ,"--threshold", help="minimum similarity score threshold",
default=None, type=float)
parser.add_argument("-q", "--queries", help="filename containing the query fingerprints")
parser.add_argument("--NxN", action="store_true",
help="use the targets as the queries, and exclude the self-similarity term")
parser.add_argument("--hex-query", help="query in hex")
parser.add_argument("--query-id", default="Query1",
help="id for the hex query")
parser.add_argument("--in", metavar="FORMAT", dest="query_format",
help="input query format (default uses the file extension, else 'fps')")
parser.add_argument("-o", "--output", metavar="FILENAME",
help="output filename (default is stdout)")
parser.add_argument("-c", "--count", help="report counts", action="store_true")
parser.add_argument("-b", "--batch-size", help="batch size",
default=100, type=int)
parser.add_argument("--scan", help="scan the file to find matches (low memory overhead)",
action="store_true")
parser.add_argument("--memory", help="build and search an in-memory data structure (faster for multiple queries)",
action="store_true")
parser.add_argument("--times", help="report load and execution times to stderr",
action="store_true")
parser.add_argument("target_filename", nargs=1, help="target filename", default=None)
## Something to enable multi-threading
#parser.add_argument("-j", "--jobs", help="number of jobs ",
# default=10, type=int)
def main(args=None):
    """Command-line entry point for simsearch.

    Parses the arguments, applies the default threshold/k policy, then
    dispatches to either the NxN self-search or the queries-vs-targets
    search, writing results in the Simsearch or Count text format.
    """
    args = parser.parse_args(args)
    # nargs=1 wraps the positional argument in a one-element list.
    target_filename = args.target_filename[0]

    threshold = args.threshold
    k = args.k_nearest

    if args.count and k is not None and k != "all":
        parser.error("--count search does not support --k-nearest")

    # People should not use this without setting parameters. On the
    # other hand, I don't want an error message if there are no
    # parameters. This solution seems to make sense.
    if threshold is None:
        if k is None:
            # If nothing is set, use defaults of --threshold 0.7 -k 3
            threshold = 0.7
            k = 3
        else:
            # only k is set; search over all possible matches
            threshold = 0.0
    else:
        if k is None:
            # only threshold is set; search for all hits above that threshold
            k = "all"

    if k == "all":
        pass
    elif k < 0:
        parser.error("--k-nearest must be non-negative or 'all'")

    if not (0.0 <= threshold <= 1.0):
        parser.error("--threshold must be between 0.0 and 1.0, inclusive")

    if args.batch_size < 1:
        parser.error("--batch-size must be positive")

    # Let environment variables tune the bitops popcount implementation.
    bitops.use_environment_variables()

    if args.NxN:
        # The NxN search supplies its own queries; these options conflict.
        if args.scan:
            parser.error("Cannot specify --scan with an --NxN search")
        if args.hex_query:
            parser.error("Cannot specify --hex-query with an --NxN search")
        if args.queries:
            parser.error("Cannot specify --queries with an --NxN search")
        do_NxN_searches(args, k, threshold, target_filename)
        return

    if args.scan and args.memory:
        parser.error("Cannot specify both --scan and --memory")
    if args.hex_query and args.queries:
        parser.error("Cannot specify both --hex-query and --queries")

    if args.hex_query:
        # The id goes into a tab-separated, line-oriented format, so it
        # must not contain any of the delimiter characters.
        query_id = args.query_id
        for c, name in ( ("\t", "tab"),
                         ("\n", "newline"),
                         ("\r", "control-return"),
                         ("\0", "NUL")):
            if c in query_id:
                parser.error("--query-id must not contain the %s character" %
                             (name,))

    # Open the target file. This reads just enough to get the header.
    try:
        targets = chemfp.open(target_filename)
    except (IOError, ValueError), err:
        # NOTE(review): unlike the queries message below, this one has no
        # trailing "\n" -- looks unintended; confirm before changing.
        sys.stderr.write("Cannot open targets file: %s" % err)
        raise SystemExit(1)

    if args.hex_query is not None:
        # Build a one-fingerprint in-memory query set from the hex string.
        # (Python 2 only: str.decode("hex").)
        try:
            query_fp = args.hex_query.decode("hex")
        except TypeError, err:
            parser.error("--hex-query is not a hex string: %s" % (err,))

        # Abort on any "error"-severity fingerprint/metadata mismatch.
        for (severity, error, msg_template) in chemfp.check_fp_problems(query_fp, targets.metadata):
            if severity == "error":
                parser.error(msg_template % dict(fp="query", metadata=repr(target_filename)))

        num_bits = targets.metadata.num_bits
        if num_bits is None:
            num_bits = len(query_fp) * 8
        query_metadata = chemfp.Metadata(num_bits=num_bits, num_bytes=len(query_fp))
        queries = chemfp.Fingerprints(query_metadata,
                                      [(query_id, query_fp)])
        query_filename = None
    else:
        query_filename = args.queries
        try:
            queries = chemfp.open(query_filename, format=args.query_format)
        except (ValueError, IOError), err:
            sys.stderr.write("Cannot open queries file: %s\n" % (err,))
            raise SystemExit(1)

    batch_size = args.batch_size
    query_arena_iter = queries.iter_arenas(batch_size)

    t1 = time.time()

    # Peek at the first arena (if any) so the heuristic below can see the
    # query count without consuming the rest of the iterator.
    first_query_arena = None
    for first_query_arena in query_arena_iter:
        break

    if args.scan:
        # Leave the targets as-is
        pass
    elif args.memory:
        targets = chemfp.load_fingerprints(targets)

    if not first_query_arena:
        # No input. Leave as-is
        pass
    elif len(first_query_arena) < min(10, batch_size):
        # Figure out the optimal search. If there is a
        # small number of inputs (< ~10) then a scan
        # of the FPS file is faster than an arena search.
        pass
    else:
        # NOTE(review): as reconstructed, this branch also loads the
        # targets when --scan was given and there are many queries --
        # confirm against upstream chemfp whether this chain should be
        # guarded by the scan/memory check above.
        targets = chemfp.load_fingerprints(targets)

    # Compare query/target metadata; errors abort, warnings go to stderr.
    problems = chemfp.check_metadata_problems(queries.metadata, targets.metadata)
    for (severity, error, msg_template) in problems:
        msg = msg_template % dict(metadata1="queries", metadata2="targets")
        if severity == "error":
            parser.error(msg)
        elif severity == "warning":
            sys.stderr.write("WARNING: " + msg + "\n")

    t2 = time.time()

    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        # "max_score" is accepted by the template dict but unused.
        type = "Tanimoto k=%(k)s threshold=%(threshold)s" % dict(
            k=k, threshold=threshold, max_score=1.0)
        if args.count:
            type = "Count threshold=%(threshold)s" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "queries": query_filename,
            "targets": target_filename,
            "query_sources": queries.metadata.sources,
            "target_sources": targets.metadata.sources})

        if first_query_arena:
            # Put the peeked-at arena back at the front of the stream.
            query_arenas = itertools.chain([first_query_arena],
                                           query_arena_iter)

            if args.count:
                report_counts(outfile, query_arenas, targets,
                              threshold=threshold)
            elif k == "all":
                report_threshold(outfile, query_arenas, targets,
                                 threshold=threshold)
            else:
                report_knearest(outfile, query_arenas, targets,
                                k=k, threshold=threshold)

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))
if __name__ == "__main__":
main()
|