""" Using the motif module of biopython Note that the python tutorial does not contain information about the Motif package. Documentation is available in the directory Doc/cookbook/motif of the biopython distribution. """ from Bio import Motif # Let's create an instance of the E2F1 motif (downloaded from the # jaspar database): motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm") # the format method displays the motif in a variety of formats: print motif.format('transfac') # You can also get a "logo" image that represents the motif: motif.weblogo('e2f1.png') # The length method does what we would expect: print len(motif) # The jaspar format for a motif specifies a matrix of counts: how many times # each nucleotide occurs in each position of the motif. # This information is stored in the 'counts' instance variable of a motif. # Let's see what design choice the biopython folks did for storing this # information: motif.counts # The matrix of counts is converted into a matrix of frequencies. # This is called a position weight matrix (PWM). # The PWM associated with a motif can be accessed as: motif.pwm() # Notice that the pwd assigns a non-zero frequency to every nucleotide at every # position, even if none have been observed. Those are the result of what are # called "pseuodo-counts" which will be explained next in the context of scoring # a sequence with respect to the motif. # The consensus sequence is the mostly like sequence represented by the motif. # It can be generated by: motif.consensus() # Let's address the question of how well does a sequence matches the motif. # We'll do that by assigning a score that is a sum of lod-odds ratios. # A motif instance has a variable associated with it called _log_oddds: motif._log_odds # The maximum score that can be achieved for the motif motif.max_score() # By default the background is assumed to be uniform: motif.background def compute_background(file_name) : from Bio import SeqIO background = {'A':0.0, 'C':0.0, 'G':0.0, 'T':0.0} count = 0.0 for seq_record in SeqIO.parse(file_name, "fasta") : seq = seq_record.seq for nucl in seq : if nucl in background : background[nucl] += 1 count += 1 for nucl in background : background[nucl] /= count return background # Let's see that the max score changes when we modify the background: motif.max_score() motif.background = compute_background('human_promoter500.fasta') motif._log_odds_is_current = False motif.max_score() # Let's look for matches of the motif in DNA sequences from Bio.Seq import Seq seq = Seq("AATTTCGCGCAA") hits = motif.search_pwm(seq, threshold = 0.7 * max_score) # Note that the search_pwm method returns an iterator. # This iterator returns tuples that contain the position and score of all the hits: for pos, score in hits : print pos, score # Let's now search for motif hits in the promoter regions of human genes from Bio import Motif motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm") max_score = motif.max_score() from Bio import SeqIO for seq_record in SeqIO.parse('human_promoter500.fasta', "fasta") : seq = seq_record.seq id = seq_record.id search_results = motif.search_pwm(seq, threshold=0.7 * max_score) for pos, score in search_results : print id, pos, score, '+' # We need to understand better what it means when there is a hit # on the negative strand from Bio import Motif motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm") max_score = motif.max_score() from Bio import SeqIO for seq_record in SeqIO.parse('human_promoter500.fasta', "fasta") : seq = seq_record.seq id = seq_record.id rc = seq.reverse_complement() search_results = motif.search_pwm(seq, threshold=0.7 * max_score) for pos, score in search_results : if pos > 0 : print id, pos, score, '+' print seq[pos:pos + len(motif)] else : print id, pos, score, '-' print seq[-pos : -pos + len(motif)] search_results = motif.search_pwm(rc, threshold=0.7 * max_score, both=False) for pos, score in search_results : print id, pos, score, '-' print rc[pos:pos + len(motif)]