haotu : an open lab notebook

2012/03/07

Parse Filter BLAST output to CSV Excel

Filed under: BLAST, Genomics, Python, Transcriptomics — S @ 04:44

I have a really large BLAST output file that I would like to look at in EXCEL.

http://ged.msu.edu/angus/tutorials/unix-blast-and-long-running-jobs.html

This website seems to tell me how to do it! Thanks!!!

python parse-blast-to-csv.py BLASTresults.txt > out.csv

Here is the code (Do check with the original source above for a more full description and to download the code directly. Get the script + more by typing hg clone http://bitbucket.org/ctb/ngs-course):

parse-blast-to-csv.py

#! /usr/bin/env python
import sys
import csv
import blastparser

# get the filename as the first argument on the command line
filename = sys.argv[1]

# open it for reading
fp = open(filename)

# send output as comma-separated values to stdout
output = csv.writer(sys.stdout)

# parse BLAST records
for record in blastparser.parse_fp(fp):
    for hit in record:
        for match in hit.matches:
            # output each match as a separate row
            row = [record.query_name, hit.subject_name, match.score,
                   match.expect]
            output.writerow(row)

You will also need the following two modules:

Module 1

blastparser.py

#! /usr/bin/env python
"""
Yet Another BLAST parser for NCBI BLAST output.

Goals:

 - nice introspection
 - nice Pythonic accessibility
 - maintainability in the face of changing NCBI BLAST output formats

Sample usage: ::

   for record in parse_file('blast_output.txt'):
      print '-', record.query_name, record.database.name
      for hit in record.hits:
         print '--', hit.subject_name, hit.subject_length
         print '  ', hit.total_score, hit.total_expect
         for submatch in hit:
            print submatch.expect, submatch.bits

            print submatch.query_sequence
            print submatch.alignment
            print submatch.subject_sequence

Here, 'submatch' is a BlastObjectSubmatch; 'hit' is a BlastSubjectHits;
'record' is a BlastQuery; and 'record.database' is a BlastDatabase.  See
the docstrings below for attributes available on these objects.

Author: C. Titus Brown
"""

__version__ = 0.2

__all__ = ['BlastParser', 'parse_fp', 'parse_file', 'parse_string',
           'open_shelf']
__docformat__ = 'restructuredtext'

import math
from cStringIO import StringIO
import parse_blast

###

class BlastSubjectSubmatch(object):
    """
    BlastSubjectSubmatch.

    A specific submatch (score/alignment) of a query sequence to a
    subject sequence.

    Attributes:

     - expect
     - frame1
     - frame2
     - score
     - query_start
     - query_end
     - subject_start
     - subject_end
     - query_sequence
     - subject_sequence

    Usage: ::

        print submatch_obj.expect

    (etc.)

    """
#    __slots__ = ['expect', 'frame1', 'frame2',
#                 'query_start', 'query_end', 'query_sequence',
#                 'subject_start', 'subject_end', 'subject_sequence', 'identity']

    def __init__(self, expect, frame1, frame2,
                 q_start, q_end, q_seq, s_start, s_end, s_seq, identity, score):
        self.expect = math.pow(10, -expect)
        self.frame1 = frame1
        self.frame2 = frame2
        self.query_start = q_start
        self.query_end = q_end
        self.query_sequence = q_seq

        self.subject_start = s_start
        self.subject_end = s_end
        self.subject_sequence = s_seq
        self.score = score

    def __repr__(self):
        return ""\
               % (self.expect, self.query_start, self.query_end,
                self.subject_start, self.subject_end)

class BlastSubjectHits(object):
    """
    BlastSubjectHits.

    A list of all of the matches between a query sequence and a subject
    sequence.

    Attributes:
     * subject_name -- name of subject sequence.
     * matches -- list of BlastSubjectSubmatch objects.

    Usage: ::

        print hits_object.subject_name
        for match in hits_object:
           print match
    """
#    __slots__ = ['subject_name', 'matches' ]
    def __init__(self, subject_name, matches):
        self.subject_name = str(subject_name)
        self.matches = matches

    def __getitem__(self, i):
        return self.matches[i]

    def __len__(self):
        return len(self.matches)

    def __repr__(self):
        seqname = build_short_sequence_name(self.subject_name)
        return "" % (seqname, len(self))

class BlastQuery(object):
    """
    A BLAST query (single sequence against database) containing all results.

    Attributes:

      * query_name -- name of query sequence (following 'Query=').
      * hits -- a list of BlastSubjectHits, containing each match + alignment.

    Usage: ::

        print query_object.query_name
        for hits_object in query_object:
           print hits_object.subject_name
    """
#    __slots__ = ['query_name', 'hits' ]
    def __init__(self, query_name, hits):
        self.query_name = query_name
        self.hits = list(hits)

    def __repr__(self):
        query_short = build_short_sequence_name(self.query_name)
        return "" % (query_short, len(self.hits))

    def __len__(self):
        return len(self.hits)

    def __getitem__(self, i):
        return self.hits[i]

class _BlastShelf(object):
    def __init__(self, filename, mode='r'):
        from shelve import BsdDbShelf
        from bsddb import btopen

        _db = btopen(filename, 'r')
        self.db = BsdDbShelf(_db)

    def __iter__(self):
        db = self.db
        last_k, _ = db.last()
        k, v = db.first()
        while k != last_k:
            yield k, v
            k, v = db.next()
        yield k, v

def open_shelf(filename, mode='r'):
    from shelve import BsdDbShelf
    from bsddb import btopen

    return _BlastShelf(filename, mode)

def parse_file(filename):
    """
    Parse records from a given file; 'filename' is the path to the file.
    """
    b = BlastParser()
    for record in b.parse_file(filename):
        yield record

def parse_fp(fp, **kw):
    """
    Parse records out of the given file handle.
    """
    b = BlastParser()

    for record in b.parse_fp(fp, **kw):
        yield record

def parse_string(s):
    """
    Parse records out of a string buffer.
    """
    fp = StringIO(s)
    b = BlastParser()

    for record in b.parse_fp(fp):
        yield record

class _PygrBlastHitParser(parse_blast.BlastHitParser):
    def generate_intervals(self):
        yield self.query_id, self.subject_id, \
              BlastSubjectSubmatch(self.e_value,
                                   None,
                                   None,
                                   self.query_start,
                                   self.query_end,
                                   self.query_seq,
                                   self.subject_start,
                                   self.subject_end,
                                   self.subject_seq,
                                   self.identity_percent,
                                   self.blast_score)

class BlastParser(object):
    """
    BlastParser objects coordinate the use of pyparsing parsers to
    parse complete BLAST records.

    Attributes:

      * blast_record -- an individual BLAST record; returns BlastQuery object.
      * blast_output -- list of records; returns list of BlastQuery objects.

    Methods:

      * reset() -- clear the blast parser of persistent information.
      * parse_string(s)
      * parse_file(filename)
      * parse_fp(fp)
    """
    def __init__(self):
        self.p = _PygrBlastHitParser()

    def parse_file(self, filename):
        fp = open(filename)
        for record in self.parse_fp(fp):
            yield record

    def parse_fp(self, fp):

        subjects = []
        matches = []

        cur_query = None
        cur_subject = None

        for query_id, subject_id, submatch in self.p.parse_file(fp):
            if cur_subject != subject_id or cur_query != query_id:
                if matches:
                    assert cur_subject
                    subject_hits = BlastSubjectHits(cur_subject, matches)
                    subjects.append(subject_hits)
                    matches = []

                cur_subject = subject_id

            if cur_query != query_id:
                if cur_query:
                    assert subjects, cur_query
                    yield BlastQuery(cur_query, subjects)
                    subjects = []

                cur_query = query_id

            matches.append(submatch)

        if matches:
            subjects.append(BlastSubjectHits(cur_subject, matches))

        if subjects:
            yield BlastQuery(cur_query, subjects)

def build_short_sequence_name(name, max_len=20):
    if len(name) < max_len:         return name     name_l = name.split()     if len(name_l) > 1:
        return build_short_sequence_name(name_l[0], max_len)

    name = name_l[0]
    if len(name) > max_len:
        name = name[:max_len-3] + '...'
    return name

#####

if __name__ == '__main__':
    import sys
    from shelve import BsdDbShelf
    from bsddb import btopen
    from optparse import OptionParser

    ### read command line parameters

    parser = OptionParser()
    parser.add_option('-z', '--zlib-compressed', action="store_true",
                      dest="zlib_compressed",
                      help="read gzipped BLAST output file")

    parser.add_option('-n', '--ignore-empty-hits', action="store_true",
                      dest="ignore_empty_hits",
                      help="ignore BLAST hits with no results")

    (options, args) = parser.parse_args()

    (blast_file, output_file) = args

    ### open blast file, open/create database r/w

    if options.zlib_compressed:
        import gzip
        blast_fp = gzip.open(blast_file)
    else:
        blast_fp = open(blast_file)

    _db = btopen(output_file, 'c')
    db = BsdDbShelf(_db)

    ### go!

    for n, record in enumerate(parse_fp(blast_fp,
                                        ignore_no_hits=options.ignore_empty_hits)):
        if n % 100 == 0:
            print '...', n

        if options.ignore_empty_hits and not record:
            continue

        name = record.query_name
        db[name] = record

    print 'read %d records total' % (n + 1,)

Module 2

parse_blast.py

from __future__ import generators
import math

class CoordsGroupStart(object):
    pass

class CoordsGroupEnd(object):
    pass

# AUTHORS: zfierstadt, leec

def is_line_start(token,line):
    "check whether line begins with token"
    return token==line[:len(token)]

def get_ori_letterunit(start,end,seq,gapchar='-'):
    """try to determine orientation (1 or -1) based on whether start>end,
    and letterunit (1 or 3) depending on the ratio of end-start difference
    vs the actual non-gap letter count.  Returns tuple (ori,letterunit)"""
    if end>start:
        ori=1
    else:
        ori= -1
    ngap=0
    for l in seq:
        if l==gapchar:
            ngap+=1
    seqlen=len(seq)-ngap
    if ori*float(end-start)/seqlen >2.0:
        letterunit=3
    else:
        letterunit=1
    return ori,letterunit

class BlastIval(object):
    def __repr__(self):
        return '<BLAST-IVAL: '  + repr(self.__dict__) + '>'

class BlastHitParser(object):
    """reads alignment info from blastall standard output.
    Method parse_file(fo) reads file object fo, and generates tuples
    suitable for BlastIval.

    Attributes:
            query_seq
            query_start
            query_end
            subject_seq
            subject_start
            subject_end
            query_id
            subject_id
            e_value
            blast_score
            identity_percent
    """
    gapchar='-'
    def __init__(self):
        self.hit_id=0
        self.nline = 0
        self.reset()
    def reset(self):
        "flush any alignment info, so we can start reading new alignment"
        self.query_seq=""
        self.subject_seq=""
        self.hit_id+=1
    def save_query(self,line):
        self.query_id=line.split()[1]
    def save_subject(self,line):
        self.subject_id=line.split()[0][1:]
    def save_score(self,line):
        "save a Score: line"
        self.blast_score=float(line.split()[2])
        s=line.split()[7]
        if s[0]=='e':
            s='1'+s
        if s.endswith(','): s = s.strip(',')
        try:
            self.e_value= -math.log(float(s))/math.log(10.0)
        except (ValueError,OverflowError), e:
            self.e_value=300.
    def save_identity(self,line):
        "save Identities line"
        s=line.split()[3][1:]
        self.identity_percent=int(s[:s.find('%')])
    def save_query_line(self,line):
        "save a Query: line"
        c=line.split()
        self.query_end=int(c[3])
        if not self.query_seq:
            self.query_start=int(c[1])
            if self.query_start < self.query_end:  # handles forward orientation
                self.query_start -= 1
        self.query_seq+=c[2]
        self.seq_start_char=line.find(c[2], 5) # IN CASE BLAST SCREWS UP Sbjct:
    def save_subject_line(self,line):
        "save a Sbjct: line, attempt to handle various BLAST insanities"
        c=line.split()
        if len(c)0: # HANDLE TBLASTN SCREWINESS: Sbjct SEQ OFTEN TOO SHORT!!
            # THIS APPEARS TO BE ASSOCIATED ESPECIALLY WITH STOP CODONS *
            self.subject_seq+=lendiff*'A' # EXTEND TO SAME LENGTH AS QUERY...
        elif lendiff=0: # END OF AN UNGAPPED INTERVAL
                    yield self.get_interval_obj(q_start, i_query,
                                                s_start, i_subject,
                                                query_ori, query_factor,
                                                subject_ori, subject_factor)
                q_start= -1
            elif q_start=0: # REPORT THE LAST INTERVAL
            yield self.get_interval_obj(q_start, i_query,
                                        s_start, i_subject,
                                        query_ori, query_factor,
                                        subject_ori, subject_factor)

        yield CoordsGroupEnd()

    def parse_file(self,myfile):
        "generate interval tuples by parsing BLAST output from myfile"
        for line in myfile:
            self.nline += 1
            if self.is_valid_hit() and \
               (is_line_start('>',line) or is_line_start(' Score =',line) \
                or is_line_start('  Database:',line) \
                or is_line_start('Query=',line)):
                for t in self.generate_intervals(): # REPORT THIS ALIGNMENT
                    yield t # GENERATE ALL ITS INTERVAL MATCHES
                self.reset() # RESET TO START A NEW ALIGNMENT
            if is_line_start('Query=',line):
                self.save_query(line)
            elif is_line_start('>',line):
                self.save_subject(line)
            elif is_line_start(' Score =',line):
                self.save_score(line)
            elif 'Identities =' in line:
                self.save_identity(line)
            elif is_line_start('Query:',line):
                self.save_query_line(line)
            elif is_line_start('Sbjct:',line):
                self.save_subject_line(line)
        if self.nline == 0: # no blast output??
            raise IOError('no BLAST output.  Check that blastall is in your PATH')

if __name__=='__main__':
    import sys
    p=BlastHitParser()
    for t in p.parse_file(sys.stdin):
        print t
Advertisements

Leave a Comment »

No comments yet.

RSS feed for comments on this post. TrackBack URI

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Create a free website or blog at WordPress.com.

%d bloggers like this: