#!/usr/bin/perl -w # AUTHOR: Jason Stajich # for use in filling in holes in all-vs-all pairwise whole genome BLASTPs # This is for generating jobs for qsub (SGE) and is a little # hacked over from other scripts so it may not make a ton of sense... sorry. use strict; use Bio::SeqIO; use File::Spec; use Getopt::Long; use Env; my $statusfile = File::Spec->catfile($HOME,"status.dat"); # STATUS FILE LOOKS LIKE THIS AND IS GENERATED BY DOING AN 'ls' IN THE DIRECTORY WITH THE RESULT FILES # IT IS STORED IN A FILE SINCE I RUN JOBS ON SYSTEM SEPARATE FROM THE STORAGE # arabidopsis_thaliana_TAIR7-vs-arabidopsis_thaliana_TAIR7.BLAST.tab.bz2 # arabidopsis_thaliana_TAIR7-vs-aspergillus_fumigatus_Af293.BLAST.tab.bz2 # arabidopsis_thaliana_TAIR7-vs-aspergillus_nidulans_A4_1.BLAST.tab.bz2 # arabidopsis_thaliana_TAIR7-vs-batrachochytrium_dendrobatidis_JAM81_4.BLAST.tab.bz2 my $qdir; my $home = File::Spec->rel2abs($ENV{'HOME'}); my $jobdir = File::Spec->catfile($home,'jobs'); my $input = File::Spec->catfile($home,'input'); my $outdir = File::Spec->catfile($home,'output'); my $tmpdir = File::Spec->catfile($home, 'tmp'); my $scratchdir = File::Spec->catfile('/scratch',$ENV{'USER'}); my $prefix = 'WUBLAST'; my $nousescript = 0; my @sge_extra; my $cleanup = 1; my $extension = "pep.fa"; my $exe = File::Spec->catfile($HOME, "src", "wu-blast", "blastp"); # ADD A Z=1000000 or something in the ballpark number of proteins if you are using these # E-values directly from the search. I am using these results as jumping off point for additional # SSEARCH based shuffled E-values for my own work. my $args = "-cpus 1 -links -postsw -topcomboN 1 E=1e-3 B=1000 V=0 -wordmask seg+xnu -mformat 3"; my $target_sp = ''; my $zip = 'bzip2'; my $zipext = 'bz2'; my $Proteinstatus = 'T'; my $formatdb = 'wu-formatdb'; GetOptions( 'j|job:s' => \$jobdir, 'o|output:s' => \$outdir, 't|tmp:s' => \$tmpdir, 'i|q|query:s'=> \$qdir, 'p|params:s' => \$args, 'ext:s' => \$extension, 'exe:s' => \$exe, 'status:s' => \$statusfile, 'cleanup!' => \$cleanup, ); die("need a qdir\n") unless $qdir; opendir(DIR,$qdir) || die $!; my %skip; if( -f $statusfile ) { open(IN, $statusfile) || die $!; while() { my ($pair) = split; my ($q,$h) = split(/-vs-/,$pair); $skip{$q}->{$h}++; } } for my $d ( $outdir, $tmpdir, $input, $jobdir) { mkdir($d) unless -d $d; } my @dbs; for my $file ( readdir(DIR) ) { next unless( $file =~ /(\S+)\.\Q$extension\E$/ ); push @dbs, [$1,$file,File::Spec->catfile($qdir,$file)]; } my $jobdirbase = File::Spec->catfile($jobdir); mkdir($jobdirbase) unless -d $jobdirbase; for my $qdb ( @dbs ) { my ($q,$qfile,$qfilepath) = @$qdb; my ($qgen,$qsp) = split(/_/,$q); my $qpref = sprintf("%s%s",substr($qgen,0,2), substr($qsp,0,3)); my @queries = $qfile; for my $tdb ( @dbs ) { my ($target,$tfile,$tfilepath) =@$tdb; my ($tgen,$tsp) = split(/_/,$target); my $tpref = sprintf("%s%s",substr($tgen,0,2), substr($tsp,0,3)); my $jobname = sprintf("$qpref-$tpref"); my $odir = File::Spec->catfile($outdir); my $i = 0; for my $qi ( @queries ) { next if ($skip{$q}->{$target}); if( $target_sp ) { next unless( $q eq $target || ($q =~ /\Q$target_sp\E/ || $target =~ /\Q$target_sp\E/ )); } my $out = File::Spec->catfile($tmpdir,"$jobname.out"); # warn("would process $jobdirbase.$i.sh\n"); #next; if( -f "$jobdir/$jobname.sh" ) { warn("overwriting $jobname\n"); } my $wkdir = File::Spec->catdir($scratchdir, "working\$\$"); my $ofiletmp = File::Spec->catfile($wkdir,"$q-vs-$target.BLAST.tab"); open(JOB, ">$jobdirbase/$jobname.sh") || die $!; print JOB join("\n","#!/bin/bash", '#$'." -o $out", '#$'." -j y ", '#$'." -S /bin/bash", '#$'." -N $jobname", '#$'." -soft -l arch=lx26-amd64", (map { '#$ '.$_ } @sge_extra), "source ~/.bash_profile", "mkdir mkdir -p $wkdir"), "\n"; print JOB "hostname\n"; print JOB join("\n","cd $wkdir", "cp $qfilepath $tfilepath $wkdir", "$formatdb -i $tfile -p $Proteinstatus", "$exe $args -i $qfile -d $tfile -o $ofiletmp", "$zip $ofiletmp", "mv $ofiletmp.$zipext $outdir", ),"\n"; if ($cleanup) { print JOB "rm -rf $wkdir\n" } close(JOB); } } }