"""
Using the motif module of biopython
Note that the python tutorial does not contain information about the
Motif package. Documentation is available in the directory
Doc/cookbook/motif of the biopython distribution.
"""
from Bio import Motif
# Let's create an instance of the E2F1 motif (downloaded from the
# jaspar database):
motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm")
# the format method displays the motif in a variety of formats:
print motif.format('transfac')
# You can also get a "logo" image that represents the motif:
motif.weblogo('e2f1.png')
# The length method does what we would expect:
print len(motif)
# The jaspar format for a motif specifies a matrix of counts: how many times
# each nucleotide occurs in each position of the motif.
# This information is stored in the 'counts' instance variable of a motif.
# Let's see what design choice the biopython folks did for storing this
# information:
motif.counts
# The matrix of counts is converted into a matrix of frequencies.
# This is called a position weight matrix (PWM).
# The PWM associated with a motif can be accessed as:
motif.pwm()
# Notice that the pwd assigns a non-zero frequency to every nucleotide at every
# position, even if none have been observed. Those are the result of what are
# called "pseuodo-counts" which will be explained next in the context of scoring
# a sequence with respect to the motif.
# The consensus sequence is the mostly like sequence represented by the motif.
# It can be generated by:
motif.consensus()
# Let's address the question of how well does a sequence matches the motif.
# We'll do that by assigning a score that is a sum of lod-odds ratios.
# A motif instance has a variable associated with it called _log_oddds:
motif._log_odds
# The maximum score that can be achieved for the motif
motif.max_score()
# By default the background is assumed to be uniform:
motif.background
def compute_background(file_name) :
from Bio import SeqIO
background = {'A':0.0, 'C':0.0, 'G':0.0, 'T':0.0}
count = 0.0
for seq_record in SeqIO.parse(file_name, "fasta") :
seq = seq_record.seq
for nucl in seq :
if nucl in background :
background[nucl] += 1
count += 1
for nucl in background :
background[nucl] /= count
return background
# Let's see that the max score changes when we modify the background:
motif.max_score()
motif.background = compute_background('human_promoter500.fasta')
motif._log_odds_is_current = False
motif.max_score()
# Let's look for matches of the motif in DNA sequences
from Bio.Seq import Seq
seq = Seq("AATTTCGCGCAA")
hits = motif.search_pwm(seq, threshold = 0.7 * max_score)
# Note that the search_pwm method returns an iterator.
# This iterator returns tuples that contain the position and score of all the hits:
for pos, score in hits :
print pos, score
# Let's now search for motif hits in the promoter regions of human genes
from Bio import Motif
motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm")
max_score = motif.max_score()
from Bio import SeqIO
for seq_record in SeqIO.parse('human_promoter500.fasta', "fasta") :
seq = seq_record.seq
id = seq_record.id
search_results = motif.search_pwm(seq, threshold=0.7 * max_score)
for pos, score in search_results :
print id, pos, score, '+'
# We need to understand better what it means when there is a hit
# on the negative strand
from Bio import Motif
motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm")
max_score = motif.max_score()
from Bio import SeqIO
for seq_record in SeqIO.parse('human_promoter500.fasta', "fasta") :
seq = seq_record.seq
id = seq_record.id
rc = seq.reverse_complement()
search_results = motif.search_pwm(seq, threshold=0.7 * max_score)
for pos, score in search_results :
if pos > 0 :
print id, pos, score, '+'
print seq[pos:pos + len(motif)]
else :
print id, pos, score, '-'
print seq[-pos : -pos + len(motif)]
search_results = motif.search_pwm(rc, threshold=0.7 * max_score, both=False)
for pos, score in search_results :
print id, pos, score, '-'
print rc[pos:pos + len(motif)]
