/usr/bin/maf-sort is in last-align 490-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#! /bin/sh
# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence. Also, merge
# identical alignments. Comment lines starting with "#" are written
# at the top, in unchanged order. If option "-d" is specified, then
# alignments that appear only once are omitted (like uniq -d).
# Minor flaws, that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name. I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!
# 3) Alignments with differences in whitespace are considered
# non-identical.
# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.
# Make "sort" use a standard ordering:
export LC_ALL=C  # exported so that child processes such as "sort" inherit it
# Parse the command line.
#   -h    print the usage text and exit successfully
#   -d    set uniqOpt=2
#   -n N  set whichSequence=N
# Outputs:
#   uniqOpt        which occurrence within a run of identical alignments
#                  gets printed: 1 (default) prints every distinct alignment
#                  once, 2 prints only alignments that are duplicated.
#   whichSequence  1-based index of the sequence to sort by (default 1).
# The parsed options are shifted away, so "$@" is left holding only the
# remaining operands (the input files).
# Exit status 2 is used for any usage error.
uniqOpt=1
whichSequence=1
while getopts hdn: opt
do
    case $opt in
        h)  cat <<EOF
Usage: $(basename "$0") [options] my-alignments.maf
Options:
-h show this help message and exit
-d only print duplicate alignments
-n sort by the n-th sequence (default: 1)
EOF
            exit
            ;;
        d)  uniqOpt=2
            ;;
        n)  whichSequence="$OPTARG"
            ;;
        *)  # Unknown option or missing argument: getopts has already
            # written a diagnostic to stderr, so just stop instead of
            # silently carrying on with the sort.
            exit 2
            ;;
    esac
done
shift $((OPTIND - 1))

# whichSequence is spliced into shell arithmetic further down, so insist on
# a plain positive decimal integer.  A leading zero is rejected as well,
# because shell arithmetic would read such a number as octal.
case $whichSequence in
    '' | 0* | *[!0-9]*)
        echo "$(basename "$0"): option -n needs a positive integer, not '$whichSequence'" >&2
        exit 2
        ;;
esac
# Build the four "sort -k" key specifications for the chosen sequence.
# The field numbers advance by 6 per sequence; baseField is the anchor
# for sequence number $whichSequence, and the keys sit at fixed offsets
# below it.  Each key has the form "N,N", which restricts the comparison
# to field N alone.
# They are used in the order a, b, c, d; per the header comment that
# corresponds to sequence name, strand, start position, end position.
baseField=$((6 * $whichSequence))
a="$(($baseField - 4)),$(($baseField - 4))"
b="$(($baseField - 1)),$(($baseField - 1))"
c="$(($baseField - 3)),$(($baseField - 3))"
d="$(($baseField - 2)),$(($baseField - 2))"
# The pipeline handles each alignment as one physical line, so that "sort"
# can order whole alignments.
#
# Stage 1 (encode):
#   - prefix a 9-digit running counter to every "#" line, so that sorting
#     leaves those lines in their input order;
#   - turn spaces into BEL (\a) on every line except "s" lines;
#   - turn the newline into BS (\b) on lines that start with a word
#     character, which glues such a line onto the one that follows.
# Stage 2 (sort): order the lines by the keys in $a, $b, $c, $d, the last
#   two compared numerically.
# Stage 3 (uniq): within each run of identical lines, print only the line
#   whose position in the run equals $uniqOpt.
# Stage 4 (decode): drop the 9-digit counter from "#" lines, then map BEL
#   and BS back to space and newline.
perl -pe '
    s/^#/sprintf("#%.9d", $serial++)/e;
    tr/ /\a/ unless /^s/;
    tr/\n/\b/ if /^\w/;
' "$@" |
    sort -k"$a" -k"$b" -k"${c}n" -k"${d}n" |
    perl -ne '
        $runLength = 0 if $previous ne $_;
        $previous = $_;
        print if ++$runLength == '"$uniqOpt"'
    ' |
    perl -pe 's/^#.{9}/#/; tr/\a\b/ \n/'
|