/usr/share/pyshared/hachoir_subfile/search.py is in python-hachoir-subfile 0.5.3-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
from hachoir_core.error import HACHOIR_ERRORS, error
from hachoir_core.stream import InputSubStream
from hachoir_core.tools import humanFilesize, humanDuration
from hachoir_core.memory import limitedMemory
from hachoir_subfile.data_rate import DataRate
from hachoir_subfile.output import Output
from hachoir_subfile.pattern import HachoirPatternMatching as PatternMatching
from sys import stderr
from time import time
def skipSubfile(parser):
    """
    Tell whether the "subfile" tag of a parser is set to "skip".

    The tags are read with parser.getParserTags(); a missing "subfile"
    entry is treated as an empty string, so the result is False.
    """
    tags = parser.getParserTags()
    return "skip" == tags.get("subfile", "")
# Max. file size in bytes (100 MB)
FILE_MAX_SIZE = 100 * 1024 ** 2
# Slice size in bytes (64 KB)
SLICE_SIZE = 64 * 1024
# Limit handed to limitedMemory() when the search is run (50 * 1024 * 1024)
MEMORY_LIMIT = 50 * 1024 ** 2
# Minimum number of seconds between two progress messages
PROGRESS_UPDATE = 1.5
class SearchSubfile:
    """
    Tool to find file start and file size in any binary stream.

    To use it:
    - instantiate the class: subfile = SearchSubfile(stream)
    - (optional) choose an output directory: subfile.setOutput(directory)
    - (optional) choose parsers: subfile.loadParsers(categories, parser_ids)
    - run the search: subfile.main()

    Constructor arguments are expressed in bytes, but every offset and size
    kept on the instance (size, start_offset, current_offset, slice_size,
    next_offset) is stored in bits.
    """

    def __init__(self, stream, offset=0, size=None):
        """
        Setup search tool, parameters:
        - stream: Input stream to search in (its 'size' attribute and its
          readBytes() method are used)
        - offset: Offset (in bytes) of the beginning of the search
        - size: Limit size (in bytes) of the searched area, counted from
          offset (None: no limit, search up to the end of the stream)
        """
        # Size (in bits)
        self.stream = stream
        if size is not None:
            self.size = min(self.stream.size, (offset+size)*8)
        else:
            self.size = self.stream.size

        # Offset (in bits)
        self.start_offset = offset*8
        self.current_offset = self.start_offset
        self.slice_size = SLICE_SIZE*8   # 64 KB (in bits)

        # Statistics
        self.datarate = DataRate(self.start_offset)
        self.main_start = time()
        # parser class => [number of tries, number of valid parsers]
        self.stats = {}

        # Search state: offset (in bits) before which matches are ignored,
        # None while no subfile asked to be skipped
        self.next_offset = None

        # Other flags and attributes
        self.patterns = None
        self.verbose = True
        self.debug = False
        self.output = None
        self.filter = None

    def setOutput(self, directory):
        """
        Enable extraction of found files: they are written using
        Output(directory).
        """
        self.output = Output(directory)

    def loadParsers(self, categories=None, parser_ids=None):
        """
        Build the pattern matching object used to find file magics.
        Both arguments are passed unchanged to PatternMatching().
        """
        before = time()
        self.patterns = PatternMatching(categories, parser_ids)
        if self.debug:
            print("Regex compilation: %.1f ms" % ((time() - before)*1000))
            print("Use regex: %s" % self.patterns)

    def main(self):
        """
        Run the search.
        Return True if ok, False otherwise (interrupted with CTRL+C or
        memory error).
        """
        # Initialize
        self.mainHeader()

        # Prepare search
        main_error = False
        try:
            # Run search
            limitedMemory(MEMORY_LIMIT, self.searchSubfiles)
        except KeyboardInterrupt:
            print >>stderr, "[!] Program interrupted (CTRL+C)"
            main_error = True
        except MemoryError:
            main_error = True
            print >>stderr, "[!] Memory error!"
        self.mainFooter()
        return not(main_error)

    def mainHeader(self):
        """
        Prepare the search: load default parsers if needed, fix the slice
        size, display the start message and reset the counters.
        """
        # Load parsers if none has been chosen. This has to be done before
        # reading self.patterns.max_length, since self.patterns is None
        # until loadParsers() is called.
        if not self.patterns:
            self.loadParsers()

        # Fix slice size if needed: a slice must be at least as long as the
        # longest pattern
        self.slice_size = max(self.slice_size, self.patterns.max_length * 8)

        nb_bytes = (self.size-self.start_offset)//8
        print >>stderr, "[+] Start search on %s bytes (%s)" % (
            nb_bytes, humanFilesize(nb_bytes))
        print >>stderr
        self.stats = {}
        self.current_offset = self.start_offset
        self.main_start = time()

    def mainFooter(self):
        """
        Display the end message: last offset and, if the search took at
        least 0.1 second, total time and global data rate.
        """
        print >>stderr
        print >>stderr, "[+] End of search -- offset=%s (%s)" % (
            self.current_offset//8, humanFilesize(self.current_offset//8))
        size = (self.current_offset - self.start_offset) // 8
        duration = time() - self.main_start
        # Skip the rate for very short searches (also avoids dividing by 0)
        if 0.1 <= duration:
            print >>stderr, "Total time: %s -- global rate: %s/sec" % (
                humanDuration(duration*1000), humanFilesize(size // duration))

    def searchSubfiles(self):
        """
        Search all subfiles in the stream, call processParser() for each parser.
        """
        self.next_offset = None
        self.next_progress = time() + PROGRESS_UPDATE
        while self.current_offset < self.size:
            self.datarate.update(self.current_offset)
            if self.verbose and self.next_progress <= time():
                self.displayProgress()
            for offset, parser in self.findMagic(self.current_offset):
                self.processParser(offset, parser)
            # Go to the next slice, or further if a subfile has to be skipped
            self.current_offset += self.slice_size
            if self.next_offset:
                self.current_offset = max(self.current_offset, self.next_offset)
            self.current_offset = min(self.current_offset, self.size)

    def processParser(self, offset, parser):
        """
        Process a valid parser: display the file position (and size when
        known) and, if an output has been set, copy the file content.
        """
        text = "[+] File at %s" % (offset//8)
        if parser.content_size is not None:
            text += " size=%s (%s)" % (parser.content_size//8, humanFilesize(parser.content_size//8))
        # Only ask the description for files of unknown size or smaller
        # than FILE_MAX_SIZE, use the class name otherwise
        if not(parser.content_size) or parser.content_size//8 < FILE_MAX_SIZE:
            text += ": " + parser.description
        else:
            text += ": " + parser.__class__.__name__
        if self.output and parser.content_size:
            if (offset == 0 and parser.content_size == self.size):
                text += " (don't copy whole file)"
            elif parser.content_size//8 >= FILE_MAX_SIZE:
                text += " (don't copy file, too big)"
            elif not self.filter or self.filter(parser):
                filename = self.output.createFilename(parser.filename_suffix)
                filename = self.output.writeFile(filename, self.stream, offset, parser.content_size)
                text += " => %s" % filename
        print(text)
        # A message has just been displayed: delay the next progress message
        self.next_progress = time() + PROGRESS_UPDATE

    def findMagic(self, offset):
        """
        Find all patterns of self.patterns in stream in offset interval:
        offset..(offset+self.slice_size), limited to the search size.

        The function returns a generator with values (offset, parser) where
        offset is beginning of a file (relative to stream begin), and not the
        position of the magic.
        """
        start = offset
        end = start + self.slice_size
        end = min(end, self.size)
        data = self.stream.readBytes(start, (end-start)//8)
        for parser_cls, offset in self.patterns.search(data):
            offset += start

            # Skip invalid offset
            if offset < 0:
                continue
            # Skip offsets located inside a subfile which asked to be skipped
            # (explicit None test: don't rely on int/None ordering)
            if self.next_offset is not None and offset < self.next_offset:
                continue

            # Create parser at found offset
            parser = self.guess(offset, parser_cls)

            # Update statistics
            if parser_cls not in self.stats:
                self.stats[parser_cls] = [0, 0]
            self.stats[parser_cls][0] += 1
            if not parser:
                continue

            # Parser is valid, yield it with the offset
            self.stats[parser_cls][1] += 1
            if self.debug:
                print >>stderr, "Found %s at offset %s" % (
                    parser.__class__.__name__, offset//8)
            yield (offset, parser)

            # Set next offset
            if parser.content_size is not None \
            and skipSubfile(parser):
                self.next_offset = offset + parser.content_size
                # The rest of the slice is inside the skipped subfile
                if end <= self.next_offset:
                    break

    def guess(self, offset, parser_cls):
        """
        Try the specified parser at stream offset 'offset'.

        Return the parser object, or None on failure.
        """
        substream = InputSubStream(self.stream, offset)
        try:
            return parser_cls(substream, validate=True)
        except HACHOIR_ERRORS:
            return None

    def displayProgress(self):
        """
        Display progress (to stderr) of the whole process.
        Compute data rate (in byte per sec) and time estimation.
        """
        # Program next update
        self.next_progress = time() + PROGRESS_UPDATE

        # Progress offset
        total = self.size - self.start_offset
        if total:
            percent = float(self.current_offset - self.start_offset) * 100 / total
        else:
            # Empty search range: nothing left to do
            percent = 100.0
        offset = self.current_offset // 8
        message = "Search: %.2f%% -- offset=%u (%s)" % (
            percent, offset, humanFilesize(offset))

        # Compute data rate (byte/sec); the average is divided by 8 because
        # offsets given to the data rate object are in bits
        average = self.datarate.average
        if average:
            message += " -- %s/sec " % humanFilesize(average // 8)
            eta = float(self.size - self.current_offset) / average
            message += " -- ETA: %s" % humanDuration(eta * 1000)

        # Display message
        print >>stderr, message
|