#!/usr/local/bin/perl -w use Getopt::Long; # (c) Copyright 2002, 2003, 2004, 2005, 2006, 2007, 2008 Stijn van Dongen # (c) Copyright 2004 Jason Stajich (column output parsing) # # Other (C)ontributors; # Dinakarpandian Deendayal (general comments) # Abel Ureta-Vidal (general comments) # Daniel Lundin (regexp tweak, warning) # # You can redistribute and/or modify this program under the terms of the GNU # General Public License; either version 3 of the License or (at your option) # any later version. # # TODO # # - Optionally read from STDIN # - Require tab file for line-mode={123,packed} # # - Infer hsp_len from long output (?) # - (Perhaps) enable section parsing ala tribe, for finer control. # - $:: usage is arbitrary and ugly. pipe down. # - Is the m9/seenlfgt combination correct? # - Check whether m9 output state is correct if errors occur. $^W = 1; use strict; $::mode_sort = 'o'; # a lphabetical # o ccurrence $::mode_score = 'e'; # e value # b it my $line_mode = ""; # default is raw mode, not line mode $::name_out = '-'; $::bcut = 0; $::ecut = 0; $::rcut = 0; my $blastfix = ""; my $addfix = ""; my $user_tabfile = ""; my %line_modes = qw ( abc 1 123 1); my $obase = ""; my $stdhandler = 0; my $help = 0; my $m9 = 0; my $abc = 0; if (! GetOptions ( "help" => \$help , "apropos" => \$help , "sort=s" => \$::mode_sort , "ecut=f" => \$::ecut , "bcut=f" => \$::bcut , "rcut=f" => \$::rcut , "score=s" => \$::mode_score , "m9" => \$m9 , "tab=s" => \$user_tabfile , "xo-dat=s" => \$addfix , "xi-dat=s" => \$blastfix , "base=s" => \$obase , "stdhandler" => \$stdhandler , "line-mode=s" => \$line_mode , "out=s" => \$::name_out ) ) { print STDERR "option processing failed\n"; exit(1); } help() && exit(0) if $help; die "unknown sort mode <$::mode_sort>" unless $::mode_sort =~ /^[ao]$/; die "unknown line mode <$line_mode>" if ($line_mode && !defined($line_modes{$line_mode})); die "unknown sort mode <$::mode_score>" unless $::mode_score =~ /^[ebr]$/; my $fname = shift || die "please submit name of blast file\n"; $obase = $fname unless $obase; if ($blastfix) { $obase = $fname; if ($fname =~ /\Q.$blastfix\E$/) { $obase =~ s/\Q.$blastfix\E$//; } else { $fname .= ".$blastfix"; } } if ($addfix) { $obase .= ".$addfix"; } my ($gix, $giy); $::seenlft = {}; $::seenrgt = {}; my $tagTocc = {}; my $me = "[$0] "; my $lc = 0; my $fhin = \*STDIN; if ($fname ne '-') { open(F_BLAST, "<$fname") || die "cannot open $fname\n"; $fhin = \*F_BLAST; } $::TAB_user = {}; if ($user_tabfile) { read_tab($user_tabfile, $::TAB_user); } $::f_raw = undef; my $f_err = undef; if ($line_mode) { if (!$::name_out || $::name_out eq '-') { $::f_raw = \*STDOUT; } else { open(F_RAW, ">$::name_out") || die "cannot open $::name_out"; $::f_raw = \*F_RAW; } $f_err = \*STDERR; if ($line_mode eq 'abc') { # print $::f_raw "#aa# (stream abc cookie)\n"; } elsif ($line_mode eq '123') { # print $::f_raw "#11# (stream abc cookie)\n"; } } else { if ($stdhandler) { $::f_raw = \*STDOUT; $f_err = \*STDERR; } else { open(F_RAW, ">$obase.raw") || die "cannot open $obase.raw\n"; $::f_raw = \*F_RAW; open(F_ERR, ">$obase.err") || die "cannot open $obase.err\n"; $f_err = \*F_ERR; } } $::ID = 0; if ($m9) { munge_linewise($fhin, $::f_raw, $line_mode); } else { munge_long($fhin, $::f_raw, $line_mode); } close $::f_raw; $::CT = scalar keys %$::seenrgt; my $alnum = 0; my $occTmisc = {}; if (!$user_tabfile && !$line_mode) { open(F_HDR, ">$obase.hdr") || die "cannot open $obase.hdr\n"; # header open(F_MAP, ">$obase.map") || die "cannot open $obase.idx\n"; # map file open(F_TAB, ">$obase.tab") || die "cannot open $obase.idx\n"; # indices print F_TAB "# \n"; if ($::mode_sort eq 'o') { print F_TAB "# sort mode is by occurrence\n"; } elsif ($::mode_sort eq 'a') { print F_TAB "# sort mode is alphabetical\n"; } if ($::mode_sort eq 'a') { for (sort {$::a cmp $::b; } keys %$tagTocc) { print F_TAB "$alnum $_\n"; $occTmisc->{$tagTocc->{$_}} = [ $alnum, $_ ]; $alnum++; } print STDERR "Index [$obase.tab] is sorted by alphabetic order\n"; } elsif ($::mode_sort eq 'o' || 1) { for (sort {$tagTocc->{$::a} <=> $tagTocc->{$::b}; } keys %$tagTocc) { print F_TAB "$alnum $_\n"; $occTmisc->{$tagTocc->{$_}} = [ $alnum, $_ ]; $alnum++; } print STDERR "Index [$obase.tab] is sorted by occurrence order\n"; print STDERR "Primary and secondary occurrences are considered equal\n"; } my $ct = keys %$occTmisc; print F_MAP "(mclheader\nmcltype matrix\ndimensions $ct", 'x', "$ct\n)\n(mclmatrix\nbegin\n"; for (sort {$::a <=> $::b; } keys %$occTmisc) { # print F_MAP "$_ ", $occTmisc->{$_}[0], " ", $occTmisc->{$_}[1], "\n"; print F_MAP "$_ $occTmisc->{$_}[0] \$\n"; } print F_MAP ")\n"; print F_HDR "(mclheader\nmcltype matrix\ndimensions "; print F_HDR $::ID . 'x' . $::ID; print F_HDR "\n)\n"; close F_TAB; close F_HDR; close F_MAP; } my $n_err = 0; for (sort keys %$::seenrgt) { if (!$::seenlft->{$_}) { print $f_err "secondary element $_ not seen as primary element\n"; print $f_err "emergency measure: added the element to the primary list\n"; $n_err++; } } if ($n_err) { print STDERR $me, "$n_err secondary elements not seen as primary element\n"; print STDERR $me, "I added all of them\n"; print STDERR $me, "There were $::CT elements in all\n"; } else { print STDERR $me, "all secondary elements were also seen as primary elements (check ok)\n"; } sub munge_linewise { my ($fh_in, $fh_raw, $line_mode) = @_; my $gix_prev = ""; while (<$fh_in>) { next if /^#/; chomp; my $sc_abc = 0; my ($gix, $giy, $percent_id, $hsp_len, $mismatches, $gapsm, $qstart, $qend,$hstart, $hend, $e, $b) = split; my $s = 0; my $idx = getid($gix, 1); next unless $idx >= 0; if ($gix_prev ne $gix) { if (!$line_mode) { if ($gix_prev) { print $fh_raw "\$\n"; } if( $idx >= 0 ) { print $fh_raw "$idx "; } } } my $idy = getid($giy, 0); $s = getscore($e, $b, $hsp_len); if ($idy >= 0) { if (!$line_mode) { print $fh_raw "$idy:$s "; } else { print $fh_raw "$gix\t$giy\t$s\n"; } } $gix_prev = $gix; } if ($gix_prev) { print $fh_raw "\$\n" if !$line_mode; } } sub munge_long { my ($fh_in, $fh_raw, $line_mode) = @_; my $need_query = 1; my $need_hits = 2; my $need_gi = 3; my $state = $need_query; while (<$fh_in>) { $lc++; chomp; if (/^Query=\s+gi\|(\d+(_\d+)?)/ || /^Query=\s+(\S+).*$/) { # warn "STATE query $1\n"; if ($state != $need_query) { print STDERR "Unexpected 'Query=' line\n"; } $gix = $1; my $idx = getid($gix, 1); if ($idx >= 0) { print $fh_raw "$idx " if !$line_mode; } $state = $need_hits; } elsif (/^Query=/) { print STDERR "Query string not recognized: $_\n"; } elsif ($state == $need_hits && /sequences producing significant alignments/i) { $state = $need_gi; # warn "STATE significant\n"; } elsif ($state == $need_hits && /no hits found/i) { print STDERR "no hits found for gi $gix\n"; print $fh_raw "\$\n" if !$line_mode; $state = $need_query; } elsif ( $state == $need_gi && ! /^>/ && (/^gi\|(\d+(_\d+)?)/ || /^(\S+)\s+.*$/) ) { $giy = $1; my $idy = getid($giy, 0); my ($s, $b, $e); if (/(\S+)\s+(\S+)\s*$/) { $b = +$1; $e = +$2; } else { print STDERR "no scores in line $lc [$_]!\n"; next; } $s = getscore($e, $b, 0); if ($idy >= 0) { # fixme, void or explain. if (!$line_mode) { print $fh_raw "$idy:$s "; } else { print $fh_raw "$giy\t$gix\t$s\n"; } } } elsif (/^\s*$/) { # paragraph skip does not change state, including the $need_gi case. } elsif (/(Statistics|Parameters):/) { $state = $need_query; # this provides WU-blast compatibility. } elsif ($state == $need_gi) { print $fh_raw "\$\n" if !$line_mode; $state = $need_query; } } if ($state == $need_gi) { print $f_err "run ended while expecting more secondary scores\n"; print STDERR "run ended while expecting more secondary scores\n"; print $fh_raw "\$\n" if !$line_mode; } } sub read_tab { my $file = shift; my $tab = shift; open (U_TAB, "<$file") || die "cannot open $file\n"; while () { if (/^\s*#/) { next; } else { if (/^(\d+)\s+(.*)/) { $tab->{$2} = $1; } else { print STDERR "___ cannot parse line: $_"; } } } } sub getscore { my ($e, $b, $hl) = @_; my $s = 0; if ($::mode_score eq 'e') { $e = "1$e" if $e =~ /^e/; $s = $e > 0 ? -log($e)/log(10) : 200; if ($s > 200) { $s = 200; } $s = $s > $::ecut ? $s : 0; } elsif ($::mode_score eq 'b') { $s = $b > $::bcut ? $b : 0; } elsif ($::mode_score eq 'r' && $hl) { $s = $b / $hl; $s = $s > $::rcut ? $s : 0; } return $s; } sub getid { my ($gi, $is_a_query) = @_; my $id = -1; if ($user_tabfile) { if (defined($::TAB_user->{$gi})) { $id = $::TAB_user->{$gi}; } else { print STDERR "___ no user tab entry for label <$gi>\n"; return -1; } } else { if (!exists($tagTocc->{$gi})) { # warn "$is_a_query $gi <-> $::ID\n"; $tagTocc->{$gi} = $::ID++; } $id = $tagTocc->{$gi}; } $::seenrgt->{$id}++; if ($is_a_query) { $::seenlft->{$id}++; } return $id; } sub help { print <<_help_; Usage: mcxdeblast file-name where file-name is in BLAST NCBI format. mcxdeblast will create base.hdr [to be read by mcxassemble] base.raw [to be read by mcxassemble] base.map [to be read by mcxassemble] base.tab [to be read by clmformat] base.err [error log] where base is derived from or equal to file-name Options: --m9 Expect column (-m 9) input. --line-mode=abc Output simple ID1 ID2 SCORE format. --score= Use bit scores, E-values, or bit scores normalized by hsp-length --sort= Use alphabetic sorting (default) or occurrence. --tab= Use user-supplied tab file. --xi-dat= Strip from file-name to create output base name. --xo-dat= Add to base name. --bcut= Ignore hits below bit score --ecut= Ignore hits below E-value --rcut= Ignore hits below raw value --out= Output file name ('-' for STDOUT) _help_ }