haotu : an open lab notebook

2012/03/21

Large BLASTX XML import to BLAST2GO

Filed under: Uncategorized — S @ 05:48

I have a large BLASTX output file in XML format that I want to import into Blast2GO.

1. Load your fasta file of sequences into blast2go.

2. Split the large BLASTX file into multiple smaller files with the awk script found here: split_xml_blast

3. Usage: awk -f split_xml_blast 3000 myblastoutput.xml (the script is written in awk, not Perl, so it must be run with awk)

4. The 3000 can be changed and indicates the number of sequences to place in each xml file.

 

 

here is the code:

#!/usr/bin/awk -f

# Split a large BLAST output file in XML format into several smaller files.
# Run split_xml_blast without parameters to see usage.

# Parse the command line and initialise the state shared by all rules.
#
# Expected invocation: split_xml_blast <number> <blast_output.xml>
#   ARGV[1]  number of sequences (iterations) to place in each output file
#   ARGV[2]  BLAST XML file to split
#
# Globals initialised here:
#   max     sequences per output file
#   cpt     counter of sequences in the current output file, starting at 1
#   nb      index of the current output file, starting at 1
#   suffix  extension appended to every output file name
#   begin   XML header written at the top of every output file
#   end     XML footer written when an output file is full
#
# On a bad command line, assert_exit is set (so END stays silent) and
# usage() is called, which terminates the program.
BEGIN {
    # The count must be a strictly positive integer: a value such as "0" or
    # "abc1" would yield a max that never triggers a split.
    if (ARGC == 3 && ARGV[1] ~ /^[0-9]+$/ && ARGV[1] + 0 > 0) {
        max = ARGV[1] + 0
        # Blank the count so awk does not try to read it as an input file.
        ARGV[1] = ""
    } else {
        assert_exit = 1
        usage()
    }

    cpt = nb = 1
    suffix = ".xml"

    # NOTE(review): the XML element names below were reconstructed from the
    # NCBI BlastOutput DTD element order; the surviving values (blastx,
    # 2.2.18, BLOSUM62, 0.1, 11, 1, F) and the number of newlines per line
    # are unchanged. Confirm against a real BLAST XML file before relying
    # on it.
    # NOTE(review): this header is hard-coded (program, version, database and
    # parameters); it is not copied from the input file.
    end = "</BlastOutput_iterations>\n</BlastOutput>"

    begin = "<?xml version=\"1.0\"?>\n<!DOCTYPE BlastOutput PUBLIC \"-//NCBI//NCBI BlastOutput/EN\" \"NCBI_BlastOutput.dtd\">\n"
    begin = begin "<BlastOutput>\n<BlastOutput_program>blastx</BlastOutput_program>\n<BlastOutput_version>blastx 2.2.18 [Mar-02-2008]</BlastOutput_version>\n"
    begin = begin "<BlastOutput_reference></BlastOutput_reference>\n<BlastOutput_db>/home/data/blastdb/nr</BlastOutput_db>\n<BlastOutput_query-ID>lcl|1_0</BlastOutput_query-ID>\n"
    begin = begin "<BlastOutput_query-def></BlastOutput_query-def>\n<BlastOutput_query-len></BlastOutput_query-len>\n<BlastOutput_param>\n<Parameters>\n<Parameters_matrix>BLOSUM62</Parameters_matrix>\n"
    begin = begin "<Parameters_expect>0.1</Parameters_expect>\n<Parameters_gap-open>11</Parameters_gap-open>\n<Parameters_gap-extend>1</Parameters_gap-extend>\n<Parameters_filter>F</Parameters_filter>\n"
    begin = begin "</Parameters>\n</BlastOutput_param>\n<BlastOutput_iterations>"
}

# Print the help banner on standard output and terminate with exit status 1.
# Takes no arguments. Because it calls exit, control passes to the END rule.
function usage()
{
    print "###################################################################################"
    print "# split_xml_blast -- split big blast output in xml format into severals files. #"
    print "# Performed in Awk v3.1 A.V. Aho, P.J. Weinberger, and B.W. Kernighan #"
    print "# OS supported: *nix, Windows9x/NT #"
    print "###################################################################################"
    print "# Author: Laurent Manchon #"
    print "# If you have comments or questions, send to the author at: #"
    print "# lmanchon@univ-montp2.fr #"
    print "###################################################################################"
    print "# #"
    print "# This program takes a file containing blast result in XML format and split #"
    # The argument placeholders were missing from the two lines below, which
    # left the usage text without any indication of the expected arguments.
    print "# it into severals small files, as: split_xml_blast <number> <xml_file> #"
    print "# with <number> : Number of sequences per output file #"
    print "# #"
    print "###################################################################################"

    exit 1
}

# Opening tag of an iteration: select the current output file, write the XML
# header if this is the first iteration of that file, then copy the tag.
#
# The pattern must be the literal opening tag. An empty regex matches every
# input line and, because the rule ends with next, no later rule would ever
# run and the header would be emitted before every line.
/<Iteration>/ {
    # Build the output prefix from the input path: keep the directory part
    # untouched and cut the base name at its first dot, so that a path such
    # as "./run.xml" yields "./run_" rather than "_".
    dir = FILENAME
    sub("[^/]*$", "", dir)
    split(substr(FILENAME, length(dir) + 1), prefix, ".")
    file = dir prefix[1] "_"

    output_file = file nb suffix
    # i is the copy flag tested by the rule that echoes iteration content.
    i = 1
    if (cpt == 1) {
        print begin >> output_file
    }
    print $0 >> output_file
    # The line is already written; skip the remaining rules to avoid a duplicate.
    next
}

# Copy the current line to the current output file while the copy flag i is 1.
# Lines consumed by an earlier rule through next never reach this rule.
i == 1 {
    print >> output_file
}

# Closing tag of an iteration: count it and, once the counter reaches max + 1,
# finish the current output file and prepare the next one.
/<\/Iteration>/ {
    cpt += 1

    # Current output file is not full yet: nothing more to do for this line.
    if (cpt != max + 1)
        next

    # Write the XML footer, release the file, and move on to the next index.
    print end >> output_file
    close(output_file)
    nb += 1
    cpt = 1
    # Stop copying lines until another rule sets the flag again.
    i = 0
    next
}

# Final report: print how many output files were produced and list them.
# Exits with status 1 without reporting when assert_exit was set in BEGIN.
END {
    if (assert_exit) exit 1

    # nb is the index of the file that would be written next once a file has
    # been filled and closed (i is then 0), so it overstates the count by one
    # in that case. It is exact only while a file is still open (i is 1).
    written = (i == 1) ? nb : nb - 1

    print "\nYour input file",FILENAME,"has just been splitted into",written,"files with",max,"sequences per file:\n"

    if (written > 0) {
        # Single-quote the prefix for the shell so that paths containing
        # spaces or other metacharacters are listed correctly; an embedded
        # single quote is rewritten as '\''.
        quoted = file
        gsub(/'/, "'\\\\''", quoted)
        # system() runs the command directly and opens no stream, so there is
        # nothing to close afterwards.
        system("ls -1 '" quoted "'*.xml")
    }
}
Advertisements

Leave a Comment »

No comments yet.

RSS feed for comments on this post. TrackBack URI

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

Blog at WordPress.com.