/usr/share/librg-utils-perl/dbSwiss is in librg-utils-perl 1.0.43-3.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl -w
use strict;
use warnings;
use Carp qw| cluck :DEFAULT |;
use Getopt::Long qw(:config gnu_getopt auto_version auto_help);
use Pod::Usage;
use DBI;
#use File::Temp qw||;
use File::Path qw||;
use IO::File;
# dbSwiss main program: index a Swiss-Prot flat file into a DBM table that maps
# each lower-cased record ID to the byte position where that record begins.
# The table is built in a scratch work directory and the resulting files are
# moved into --datadir only after the whole input has been indexed.

BEGIN { our $VERSION = "1.0"; } # set at compile time so it is available to option handling (--version)

# Sanitise the environment before any external program (mktemp, mv, tput) is spawned.
$ENV{'PATH'} = '/usr/bin:/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

our $opt;
my $Lok = GetOptions( $opt = { datadir => '/mnt/project/rost_db/data/swissprot', 'table' => 'dbswiss', infile => '/mnt/project/rost_db/data/swissprot/uniprot_sprot.dat' },
  'debug!',
  'datadir|d=s',
  'first20!',
  'infile|i=s',
  'man!',
  'quiet!',
  'readback!',
  'table=s',
  'workdir|w=s'
);
if( !$Lok ){ die("Invalid arguments, please check man page.\n"); }

# Package globals: _printcount() reads $main::quiet.
our $dbg = $opt->{debug};
our $quiet = $opt->{quiet};

if( $opt->{man} ){ pod2usage(-verbose => 2); }

# The table name is interpolated into SQL below and becomes the base name of the
# database files, so only a non-empty run of [a-z0-9_] (any case) is accepted.
if( $opt->{table} !~ /\A[a-z0-9_]+\z/i ){ confess("invalid table name '$opt->{table}'"); }

# lkajan: the below would be nice but perhaps because of the ACLs I get "Error in tempdir() using /mnt/project/rost_db/src/XXXXXXXXXX: Parent directory (/mnt/project/rost_db/src) is not writable" when it is in fact writable
#my $workdir = $opt->{workdir} || File::Temp::tempdir( DIR => $opt->{datadir}, CLEANUP => !$dbg );
my $CLEANUP = 0;
my $workdir = $opt->{workdir};
if( !$workdir )
{
  # List-form pipe open: mktemp is executed directly, without a shell, so quotes or
  # metacharacters in --datadir cannot break or inject into the command line.
  open( my $mktemp, '-|', 'mktemp', '-d', "$opt->{datadir}/XXXXXXXXXX" ) || confess( "failed to run mktemp: $!" );
  my $created = <$mktemp>;
  close( $mktemp ) || confess( "mktemp failed in '$opt->{datadir}' (wait status $?)" );
  if( !defined( $created ) ){ confess( "mktemp did not report a directory name" ); }
  chomp( $created );
  if( !length( $created ) ){ confess( "mktemp did not report a directory name" ); }
  $workdir = $created;
  # Enable cleanup only once a directory really exists; it is kept when --debug is on.
  $CLEANUP = !$dbg;
}
# Remove the automatically created work directory on exit (normal or die).
END { if( $CLEANUP && defined( $workdir ) && length( $workdir ) ){ File::Path::rmtree( $workdir ); } }

#my $dbh = DBI->connect('DBI:DBM(RaiseError=1):type=GDBM_File;ext=;') || confess( $DBI::errstr );
my $dbh = DBI->connect('DBI:DBM(RaiseError=1):type=DB_File;ext=;') || confess( $DBI::errstr );
$dbh->{f_dir} = $workdir;
if( $dbg ){ warn( "workdir = $workdir" ); }

my $infh = IO::File->new( $opt->{infile}, 'r' ) || confess( "failed to open '$opt->{infile}': $!" );

$dbh->do( "drop table if exists $opt->{table}");
$dbh->do( "create table $opt->{table} ( id text, bytepos int )" ); # the value is the byte position where the record begins in the flat file

my $count = 0;   # number of records indexed so far
my $bytetop = 0; # byte position where the record about to be read begins
my $t0 = time(); # second in which the progress counter was last refreshed
{
  # Records are terminated by a line holding only '//'.
  local $/ = "//\n";
  while( my $rec = <$infh> )
  {
    #ID 002R_IIV3 Reviewed; 458 AA.
    #ID C108B_XENLA Reviewed; 183 AA.
    if( $rec !~ /^ID\s+(\w+)/ ){ confess("unrecognized record $count '$rec'"); }
    my $id = lc($1);

    $dbh->do("insert into $opt->{table} values ( ?, ? )", undef, $id, $bytetop );

    if( $opt->{readback} )
    {
      # Separate name so the record text in $rec is not shadowed.
      my $row = $dbh->selectrow_arrayref("select * from $opt->{table} where id = ?", undef, $id );
      print STDERR ">>>$row->[0] : $row->[1]<<<\n";
    }
    #
    ++$count;
    # The next record starts where this one ended.
    $bytetop = tell( $infh );

    # Refresh the progress display at most once per second.
    my $now = time();
    if( $t0 != $now ){ $t0 = $now; _printcount( $count ); }

    # $count was already incremented, so stop as soon as exactly 20 records are stored.
    if( $opt->{first20} && $count >= 20 ){ last; }
  }
  _printcount( $count ); if( !$main::quiet ){ print STDERR "\n"; }
}
$infh->close();
$dbh->disconnect();

# we are done, move the db files to their final destination
my @resfiles = glob( "$workdir/*" );
if( !@resfiles ){ confess("no database files were produced in '$workdir'"); }
# '--' ends option parsing so a file name can never be taken for an mv option.
system( 'mv', "-t", "$opt->{datadir}", '--', @resfiles ) && confess("failed to mv @resfiles to $opt->{datadir}");
exit(0);
# _printcount( $n )
#
# Progress indicator: rewrites the current terminal line on STDERR with $n.
# Prints nothing when $main::quiet is true.
#
#   $n - the number to display (the count of records processed so far)
#
# Returns nothing.
sub _printcount
{
  my( $__count ) = @_;
  if( !$main::quiet )
  {
    # 'tput hpa 0' moves the cursor to column 0 and 'tput el' clears to the end
    # of the line, so successive counts overwrite each other in place.
    # Print the argument that was passed in, not a variable from the enclosing file scope.
    print STDERR `tput hpa 0`, `tput el`, $__count;
  }
  return;
}
=pod
=head1 NAME
dbSwiss - create DBM version of Swiss-Prot data
=head1 SYNOPSIS
__pkgdatadir__/dbSwiss [OPTIONS]
__pkgdatadir__/dbSwiss --datadir /data/swissprot --infile /data/swissprot/uniprot_sprot.dat
__pkgdatadir__/dbSwiss [--help] [--man]
=head1 DESCRIPTION
dbSwiss creates a DBM index of Swiss-Prot data. This procedure replaces splitSwiss.pl, which saves Swiss-Prot records in separate files, resulting in over 13 million relatively tiny files that take very long to create and rsync. dbSwiss instead stores, for each lower-cased record ID, the byte position at which the record begins in the flat file, in a DBM database that is optimized for fast retrieval.
=head1 OPTIONS
=over
=item -d, --datadir=I<path>
directory of database files, default: '/mnt/project/rost_db/data/swissprot'
=item --debug
=item --nodebug
=item --first20
=item --nofirst20
process only first 20 records, for debugging
=item --help
=item -i, --infile=I<path>
Swiss-Prot data flatfile, default: '/mnt/project/rost_db/data/swissprot/uniprot_sprot.dat'.
=item --man
=item --quiet
=item --noquiet
do not print progress status
=item --readback
=item --noreadback
read records back after storing and print them
=item --table
name of database table and consequently the base name of database files, default: 'dbswiss'
=item --version
=item -w, --workdir=I<path>
Optional working directory. Automatically created and removed if not defined.
=back
=head1 AUTHOR
Laszlo Kajan <lkajan@rostlab.org>
=cut
# vim:et:ts=2:ai:
|