/usr/share/pyshared/allmydata/scripts/backupdb.py is in tahoe-lafs 1.9.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# the backupdb is only available if sqlite3 is available. Python-2.5.x and
# beyond include sqlite3 in the standard library. For python-2.4, the
# "pysqlite2" "package" (or "module") (which, despite the confusing name, uses
# sqlite3, and which, confusingly, comes in the "pysqlite" "distribution" (or
# "package")) must be installed. On debian, install python-pysqlite2

import os.path, sys, time, random, stat

from allmydata.util.netstring import netstring
from allmydata.util.hashutil import backupdb_dirhash
from allmydata.util import base32
from allmydata.util.fileutil import abspath_expanduser_unicode
from allmydata.util.encodingutil import to_str

DAY = 24*60*60
MONTH = 30*DAY

SCHEMA_v1 = """
CREATE TABLE version -- added in v1
(
 version INTEGER  -- contains one row, set to 2
);

CREATE TABLE local_files -- added in v1
(
 path VARCHAR(1024) PRIMARY KEY, -- index, this is an absolute UTF-8-encoded local filename
 size INTEGER,                   -- os.stat(fn)[stat.ST_SIZE]
 mtime NUMBER,                   -- os.stat(fn)[stat.ST_MTIME]
 ctime NUMBER,                   -- os.stat(fn)[stat.ST_CTIME]
 fileid INTEGER
);

CREATE TABLE caps -- added in v1
(
 fileid INTEGER PRIMARY KEY AUTOINCREMENT,
 filecap VARCHAR(256) UNIQUE     -- URI:CHK:...
);

CREATE TABLE last_upload -- added in v1
(
 fileid INTEGER PRIMARY KEY,
 last_uploaded TIMESTAMP,
 last_checked TIMESTAMP
);
"""

TABLE_DIRECTORY = """
CREATE TABLE directories -- added in v2
(
 dirhash varchar(256) PRIMARY KEY, -- base32(dirhash)
 dircap varchar(256),              -- URI:DIR2-CHK:...
 last_uploaded TIMESTAMP,
 last_checked TIMESTAMP
);
"""

SCHEMA_v2 = SCHEMA_v1 + TABLE_DIRECTORY

UPDATE_v1_to_v2 = TABLE_DIRECTORY + """
UPDATE version SET version=2;
"""

def get_backupdb(dbfile, stderr=sys.stderr,
                 create_version=(SCHEMA_v2, 2), just_create=False):
    # open or create the given backupdb file. The parent directory must
    # exist.
    try:
        import sqlite3
        sqlite = sqlite3 # pyflakes whines about 'import sqlite3 as sqlite' ..
    except ImportError:
        from pysqlite2 import dbapi2
        sqlite = dbapi2 # .. when this clause does it too
        # This import should never fail, because setuptools requires that the
        # "pysqlite" distribution is present at start time (if on Python < 2.5).

    must_create = not os.path.exists(dbfile)
    try:
        db = sqlite.connect(dbfile)
    except (EnvironmentError, sqlite.OperationalError), e:
        print >>stderr, "Unable to create/open backupdb file %s: %s" % (dbfile, e)
        return None

    c = db.cursor()
    if must_create:
        schema, version = create_version
        c.executescript(schema)
        c.execute("INSERT INTO version (version) VALUES (?)", (version,))
        db.commit()

    try:
        c.execute("SELECT version FROM version")
        version = c.fetchone()[0]
    except sqlite.DatabaseError, e:
        # this indicates that the file is not a compatible database format.
        # Perhaps it was created with an old version, or it might be junk.
        print >>stderr, "backupdb file is unusable: %s" % e
        return None

    if just_create: # for tests
        return True

    if version == 1:
        c.executescript(UPDATE_v1_to_v2)
        db.commit()
        version = 2
    if version == 2:
        return BackupDB_v2(sqlite, db)
    print >>stderr, "Unable to handle backupdb version %s" % version
    return None

class FileResult:
    def __init__(self, bdb, filecap, should_check,
                 path, mtime, ctime, size):
        self.bdb = bdb
        self.filecap = filecap
        self.should_check_p = should_check
        self.path = path
        self.mtime = mtime
        self.ctime = ctime
        self.size = size

    def was_uploaded(self):
        if self.filecap:
            return self.filecap
        return False

    def did_upload(self, filecap):
        self.bdb.did_upload_file(filecap, self.path,
                                 self.mtime, self.ctime, self.size)

    def should_check(self):
        return self.should_check_p

    def did_check_healthy(self, results):
        self.bdb.did_check_file_healthy(self.filecap, results)

class DirectoryResult:
    def __init__(self, bdb, dirhash, dircap, should_check):
        self.bdb = bdb
        self.dircap = dircap
        self.should_check_p = should_check
        self.dirhash = dirhash

    def was_created(self):
        if self.dircap:
            return self.dircap
        return False

    def did_create(self, dircap):
        self.bdb.did_create_directory(dircap, self.dirhash)

    def should_check(self):
        return self.should_check_p

    def did_check_healthy(self, results):
        self.bdb.did_check_directory_healthy(self.dircap, results)

class BackupDB_v2:
    VERSION = 2
    NO_CHECK_BEFORE = 1*MONTH
    ALWAYS_CHECK_AFTER = 2*MONTH

    def __init__(self, sqlite_module, connection):
        self.sqlite_module = sqlite_module
        self.connection = connection
        self.cursor = connection.cursor()

    def check_file(self, path, use_timestamps=True):
        """I will tell you if a given local file needs to be uploaded or not,
        by looking in a database and seeing if I have a record of this file
        having been uploaded earlier.

        I return a FileResult object, synchronously. If r.was_uploaded()
        returns False, you should upload the file. When you are finished
        uploading it, call r.did_upload(filecap), so I can update my
        database.

        If was_uploaded() returns a filecap, you might be able to avoid an
        upload. Call r.should_check(), and if it says False, you can skip the
        upload and use the filecap returned by was_uploaded().

        If should_check() returns True, you should perform a filecheck on the
        filecap returned by was_uploaded(). If the check indicates the file
        is healthy, please call r.did_check_healthy(checker_results) so I can
        update the database, using the de-JSONized response from the webapi
        t=check call for 'checker_results'. If the check indicates the file
        is not healthy, please upload the file and call r.did_upload(filecap)
        when you're done.

        If use_timestamps=True (the default), I will compare ctime and mtime
        of the local file against an entry in my database, and consider the
        file to be unchanged if ctime, mtime, and filesize are all the same
        as the earlier version. If use_timestamps=False, I will not trust the
        timestamps, so more files (perhaps all) will be marked as needing
        upload. A future version of this database may hash the file to make
        equality decisions, in which case use_timestamps=False will not
        always imply r.must_upload()==True.

        'path' points to a local file on disk, possibly relative to the
        current working directory. The database stores absolute pathnames.
        """

        path = abspath_expanduser_unicode(path)
        s = os.stat(path)
        size = s[stat.ST_SIZE]
        ctime = s[stat.ST_CTIME]
        mtime = s[stat.ST_MTIME]

        now = time.time()
        c = self.cursor

        c.execute("SELECT size,mtime,ctime,fileid"
                  " FROM local_files"
                  " WHERE path=?",
                  (path,))
        row = self.cursor.fetchone()
        if not row:
            return FileResult(self, None, False, path, mtime, ctime, size)
        (last_size, last_mtime, last_ctime, last_fileid) = row

        c.execute("SELECT caps.filecap, last_upload.last_checked"
                  " FROM caps,last_upload"
                  " WHERE caps.fileid=? AND last_upload.fileid=?",
                  (last_fileid, last_fileid))
        row2 = c.fetchone()

        if ((last_size != size
             or not use_timestamps
             or last_mtime != mtime
             or last_ctime != ctime) # the file has been changed
            or (not row2) # we somehow forgot where we put the file last time
            ):
            c.execute("DELETE FROM local_files WHERE path=?", (path,))
            self.connection.commit()
            return FileResult(self, None, False, path, mtime, ctime, size)

        # at this point, we're allowed to assume the file hasn't been changed
        (filecap, last_checked) = row2
        age = now - last_checked
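
        # map the time since the last check onto a probability that ramps
        # linearly from 0 (checked within NO_CHECK_BEFORE, one month) to 1
        # (unchecked for ALWAYS_CHECK_AFTER, two months), so periodic
        # filechecks are spread out instead of clustered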
        probability = ((age - self.NO_CHECK_BEFORE) /
                       (self.ALWAYS_CHECK_AFTER - self.NO_CHECK_BEFORE))
        probability = min(max(probability, 0.0), 1.0)
        should_check = bool(random.random() < probability)

        return FileResult(self, to_str(filecap), should_check,
                          path, mtime, ctime, size)

    def get_or_allocate_fileid_for_cap(self, filecap):
        # find an existing fileid for this filecap, or insert a new one. The
        # caller is required to commit() afterwards.

        # mysql has "INSERT ... ON DUPLICATE KEY UPDATE", but not sqlite
        # sqlite has "INSERT ON CONFLICT REPLACE", but not mysql
        # So we use INSERT, ignore any error, then a SELECT
        c = self.cursor
        try:
            c.execute("INSERT INTO caps (filecap) VALUES (?)", (filecap,))
        except (self.sqlite_module.IntegrityError,
                self.sqlite_module.OperationalError):
            # sqlite3 on sid gives IntegrityError
            # pysqlite2 on dapper gives OperationalError
            pass
        c.execute("SELECT fileid FROM caps WHERE filecap=?", (filecap,))
        foundrow = c.fetchone()
        assert foundrow
        fileid = foundrow[0]
        return fileid

    def did_upload_file(self, filecap, path, mtime, ctime, size):
        now = time.time()
        fileid = self.get_or_allocate_fileid_for_cap(filecap)
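        # upsert: try an INSERT first, and fall back to an UPDATE when a row
        # for this fileid (or, below, this path) already exists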
        try:
            self.cursor.execute("INSERT INTO last_upload VALUES (?,?,?)",
                                (fileid, now, now))
        except (self.sqlite_module.IntegrityError,
                self.sqlite_module.OperationalError):
            self.cursor.execute("UPDATE last_upload"
                                " SET last_uploaded=?, last_checked=?"
                                " WHERE fileid=?",
                                (now, now, fileid))
        try:
            self.cursor.execute("INSERT INTO local_files VALUES (?,?,?,?,?)",
                                (path, size, mtime, ctime, fileid))
        except (self.sqlite_module.IntegrityError,
                self.sqlite_module.OperationalError):
            self.cursor.execute("UPDATE local_files"
                                " SET size=?, mtime=?, ctime=?, fileid=?"
                                " WHERE path=?",
                                (size, mtime, ctime, fileid, path))
        self.connection.commit()

    def did_check_file_healthy(self, filecap, results):
        now = time.time()
        fileid = self.get_or_allocate_fileid_for_cap(filecap)
        self.cursor.execute("UPDATE last_upload"
                            " SET last_checked=?"
                            " WHERE fileid=?",
                            (now, fileid))
        self.connection.commit()

    def check_directory(self, contents):
        """I will tell you if a new directory needs to be created for a given
        set of directory contents, or if I know of an existing (immutable)
        directory that can be used instead.

        'contents' should be a dictionary that maps from child name (a single
        unicode string) to immutable childcap (filecap or dircap).

        I return a DirectoryResult object, synchronously. If r.was_created()
        returns False, you should create the directory (with
        t=mkdir-immutable). When you are finished, call r.did_create(dircap)
        so I can update my database.

        If was_created() returns a dircap, you might be able to avoid the
        mkdir. Call r.should_check(), and if it says False, you can skip the
        mkdir and use the dircap returned by was_created().

        If should_check() returns True, you should perform a check operation
        on the dircap returned by was_created(). If the check indicates the
        directory is healthy, please call
        r.did_check_healthy(checker_results) so I can update the database,
        using the de-JSONized response from the webapi t=check call for
        'checker_results'. If the check indicates the directory is not
        healthy, please repair or re-create the directory and call
        r.did_create(dircap) when you're done.
        """

        now = time.time()
        entries = []
        for name in contents:
            entries.append( [name.encode("utf-8"), contents[name]] )
        entries.sort()
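        # serialize the sorted (name, cap) pairs as netstrings and hash the
        # result, giving a stable fingerprint of the directory contents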
data = "".join([netstring(name_utf8)+netstring(cap)
for (name_utf8,cap) in entries])
dirhash = backupdb_dirhash(data)
dirhash_s = base32.b2a(dirhash)
c = self.cursor
c.execute("SELECT dircap, last_checked"
" FROM directories WHERE dirhash=?", (dirhash_s,))
row = c.fetchone()
if not row:
return DirectoryResult(self, dirhash_s, None, False)
(dircap, last_checked) = row
age = now - last_checked
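
        # same linear check-probability ramp as in check_file() above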
        probability = ((age - self.NO_CHECK_BEFORE) /
                       (self.ALWAYS_CHECK_AFTER - self.NO_CHECK_BEFORE))
        probability = min(max(probability, 0.0), 1.0)
        should_check = bool(random.random() < probability)

        return DirectoryResult(self, dirhash_s, to_str(dircap), should_check)

    def did_create_directory(self, dircap, dirhash):
        now = time.time()
        # if the dirhash is already present (i.e. we've re-uploaded an
        # existing directory, possibly replacing the dircap with a new one),
        # update the record in place. Otherwise create a new record.
        self.cursor.execute("REPLACE INTO directories VALUES (?,?,?,?)",
                            (dirhash, dircap, now, now))
        self.connection.commit()

    def did_check_directory_healthy(self, dircap, results):
        now = time.time()
        self.cursor.execute("UPDATE directories"
                            " SET last_checked=?"
                            " WHERE dircap=?",
                            (now, dircap))
        self.connection.commit()
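For illustration, here is a minimal sketch (not part of the file above) of how a caller such as "tahoe backup" might drive this API, following the protocol described in the check_file() and check_directory() docstrings. The database path and the upload_to_grid(), check_health(), and mkdir_immutable() helpers are hypothetical stand-ins for real upload, filecheck, and t=mkdir-immutable operations; the shape of the check results dict is assumed to follow the webapi t=check JSON response:

    import sys
    from allmydata.scripts.backupdb import get_backupdb

    bdb = get_backupdb("/path/to/backupdb.sqlite")  # hypothetical path
    if bdb is None:
        sys.exit(1)  # unusable or unsupported database file

    r = bdb.check_file(u"photo.jpg")
    filecap = r.was_uploaded()
    if not filecap:
        # never seen before (or changed since last backup): upload it
        filecap = upload_to_grid(u"photo.jpg")      # hypothetical helper
        r.did_upload(filecap)
    elif r.should_check():
        # known, but due for a periodic filecheck
        results = check_health(filecap)             # hypothetical helper
        if results["results"]["healthy"]:           # assumed t=check shape
            r.did_check_healthy(results)
        else:
            filecap = upload_to_grid(u"photo.jpg")
            r.did_upload(filecap)
    # otherwise: reuse the stored filecap without contacting the grid

    r2 = bdb.check_directory({u"photo.jpg": filecap})
    dircap = r2.was_created()
    if not dircap:
        dircap = mkdir_immutable({u"photo.jpg": filecap})  # hypothetical helper
        r2.did_create(dircap)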