/usr/bin/dnaclust-ref is in dnaclust 3-5.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/bin/bash
# Default settings; -t replaces threads and -r replaces similarity.
threads=1
similarity=0.98
# Print the usage summary for dnaclust-ref on standard output.
# Arguments: none.
# Outputs:   the help text, passed through fold so no line exceeds 120 columns.
print_help()
{
# Quoted delimiter: the help text is emitted literally, with no expansion.
fold -w 120 <<'EOF'
Usage: dnaclust-ref [OPTIONS...]
DNACLUST helper script to cluster sequences using a reference database.
-c CENTERS Fasta file of cluster centers/references.
-d After clustering with reference database, perform de novo clustering.
-r SIMILARITY Set similarity between cluster center and cluster
sequences (default=0.98)
-t THREADS Set the number of threads to use (default=1)
-i INPUT_FILE Fasta file of sequences to be clustered.
-l Pass --left-gaps-allowed to dnaclust
-n Pass --no-overlap to dnaclust
-v Print verbose messages to standard error
-h Give this help list
The sequences to be clustered are read from INPUT_FILE. Messages are written to STDERR.
Clusters recruited with the reference database are written to INPUT_FILE.db.clusters.
With -d, de novo clusters of the remaining sequences are written to INPUT_FILE.denovo.clusters.
EOF
}
# Parse the command-line options.
#   -c, -i, -r, -t  store their argument in the matching variable.
#   -d, -v, -l, -n  are switches; each sets its flag variable to 0.
#   -h              prints the help and exits with status 0.
#   ?               (getopts reported an error) prints the help and exits 1.
while getopts "c:i:dr:t:vhln" option; do
  case "$option" in
    h)
      print_help
      exit 0
      ;;
    \?)
      print_help
      exit 1
      ;;
    c) cluster_centers="$OPTARG" ;;
    i) input="$OPTARG" ;;
    r) similarity="$OPTARG" ;;
    t) threads="$OPTARG" ;;
    d) de_novo_cluster=0 ;;
    v) verbose=0 ;;
    l) left_gaps_allowed=0 ;;
    n) no_overlap=0 ;;
  esac
done
# Write a timestamped diagnostic to standard error when verbose mode is on.
# Globals:   verbose (read) - any non-empty value enables output.
# Arguments: $1 - message text.
# Outputs:   "HH:MM:SS <message>" on stderr; nothing when verbose is empty or unset.
print_message()
{
  # [[ -n ... ]] with a quoted expansion: an unquoted $verbose inside [ ] is
  # word-split, so a multi-word value would make the test itself fail.
  if [[ -n "${verbose:-}" ]]
  then
    printf '%s %s\n' "$(date +%T)" "${1:-}" >&2
  fi
}
# ---------------------------------------------------------------------------
# Main flow:
#   1. sort the reference centers with fastasort into a temporary directory;
#   2. run dnaclust against them and keep only multi-member clusters in
#      <input>.db.clusters;
#   3. with -d, run dnaclust de novo on the sequences that were not recruited
#      and write <input>.denovo.clusters.
# Exit status: 0 when every step succeeded, 1 otherwise.
# ---------------------------------------------------------------------------

# -i is mandatory: every dnaclust call and every output name is built from it.
if [[ -z "${input:-}" ]]
then
  echo "dnaclust-ref: missing required option -i INPUT_FILE" >&2
  print_help >&2
  exit 1
fi

# Make a failing dnaclust visible through the "dnaclust | awk" pipeline.
set -o pipefail
status=0

# Extra dnaclust flags, kept in an array so each stays a separate word.
parameters=()
if [[ -n "${left_gaps_allowed:-}" ]]
then
  parameters+=(--left-gaps-allowed)
fi
if [[ -n "${no_overlap:-}" ]]
then
  parameters+=(--no-overlap)
fi

# The dnaclust binary is looked up in the directory containing this script.
dnaclust_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
print_message "$dnaclust_path"

# mktemp is invoked with -t on Darwin and with -p on every other platform.
UNAME=$( uname )
tempdir=""
if [[ "$UNAME" == "Darwin" ]]
then
  tempdir=$(mktemp -d -t .)
else
  tempdir=$(mktemp -d -p .)
fi
if [[ -z "$tempdir" || ! -d "$tempdir" ]]
then
  echo "dnaclust-ref: could not create a temporary directory" >&2
  exit 1
fi
# Single quotes: $tempdir is expanded (and quoted) when the trap fires.
trap 'rm -fr -- "$tempdir"' EXIT

# Sort the reference centers. Without -c they are taken from standard input.
centers_source="${cluster_centers:-/dev/stdin}"
db_sorted="$(basename -- "${cluster_centers:-stdin}" .fasta).sorted.fasta"
print_message "Reading and sorting the database sequences: $tempdir/${db_sorted}"
"/usr/lib/dnaclust/fastasort" < "$centers_source" > "$tempdir/${db_sorted}" || status=1

print_message "Recruiting from the sequences, using database."
# Keep only clusters with more than one field, i.e. a center plus at least one
# recruited sequence.
"$dnaclust_path/dnaclust" "${parameters[@]}" -s "$similarity" -t "$threads" \
  --no-k-mer-filter -i "$input" -p "$tempdir/${db_sorted}" -r \
  | awk 'NF > 1' > "${input}.db.clusters" || status=1
print_message "DB recruited sequences: $input.db.clusters"

if [[ -n "${de_novo_cluster:-}" ]]
then
  print_message "Run DNACLUST on remaining sequences."
  unclustered_seq="$(basename -- "$input" .fasta).unclustered.fasta"

  # One identifier per line for everything already placed in a cluster.
  # The file is redirected rather than passed as an operand so that a name
  # containing '=' is never read by awk as a variable assignment.
  awk '{ for (i = 1; i <= NF; i++) print $i }' < "${input}.db.clusters" \
    > "$tempdir/clustered_seqs" || status=1

  "/usr/lib/dnaclust/fastaselect" --everything-except -f "$input" \
    < "$tempdir/clustered_seqs" > "$tempdir/${unclustered_seq}" || status=1

  if [[ -s "$tempdir/${unclustered_seq}" ]]
  then
    "$dnaclust_path/dnaclust" "${parameters[@]}" -s "$similarity" -t "$threads" \
      --no-k-mer-filter -i "$tempdir/${unclustered_seq}" \
      > "${input}.denovo.clusters" || status=1
    print_message "Writing de novo clusters to: $input.denovo.clusters"
  else
    # Nothing left to cluster: still produce an (empty) result file.
    touch -- "${input}.denovo.clusters" || status=1
  fi
fi

exit "$status"
|