"""
Using the motif module of biopython
Note that the python tutorial does not contain information about the
Motif package.  Documentation is available in the directory
Doc/cookbook/motif of the biopython distribution.
"""


from Bio import Motif

# Let's create an instance of the E2F1 motif (downloaded from the
# jaspar database):
motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print motif.format('transfac')

# You can also get a "logo" image that represents the motif:
motif.weblogo('e2f1.png')

# The length method does what we would expect:
print len(motif)

# The jaspar format for a motif specifies a matrix of counts:  how many times
# each nucleotide occurs in each position of the motif.
# This information is stored in the 'counts' instance variable of a motif.
# Let's see what design choice the biopython folks did for storing this
# information:
motif.counts

# The matrix of counts is converted into a matrix of frequencies.
# This is called a position weight matrix (PWM).
# The PWM associated with a motif can be accessed as:

motif.pwm()

# Notice that the pwd assigns a non-zero frequency to every nucleotide at every
# position, even if none have been observed.  Those are the result of what are
# called "pseuodo-counts" which will be explained next in the context of scoring
# a sequence with respect to the motif.

# The consensus sequence is the mostly like sequence represented by the motif.
# It can be generated by:
motif.consensus()

# Let's address the question of how well does a sequence matches the motif.
# We'll do that by assigning a score that is a sum of lod-odds ratios.
# A motif instance has a variable associated with it called _log_oddds:

motif._log_odds

# The maximum score that can be achieved for the motif
motif.max_score()

# By default the background is assumed to be uniform:
motif.background

def compute_background(file_name) :
    from Bio import SeqIO
    background = {'A':0.0, 'C':0.0, 'G':0.0, 'T':0.0}
    count = 0.0
    for seq_record in SeqIO.parse(file_name, "fasta") :
        seq = seq_record.seq
        for nucl in seq :
            if nucl in background :
                background[nucl] += 1
            count += 1
    for nucl in background :
        background[nucl] /= count
    return background

# Let's see that the max score changes when we modify the background:
motif.max_score()
motif.background = compute_background('human_promoter500.fasta')
motif._log_odds_is_current = False        
motif.max_score()



# Let's look for matches of the motif in DNA sequences
from Bio.Seq import Seq
seq = Seq("AATTTCGCGCAA")
hits = motif.search_pwm(seq, threshold = 0.7 * max_score)
# Note that the search_pwm method returns an iterator.
# This iterator returns tuples that contain the position and score of all the hits:
for pos, score in hits :
    print pos, score

# Let's now search for motif hits in the promoter regions of human genes
from Bio import Motif
motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm")
max_score = motif.max_score()
from Bio import SeqIO
for seq_record in SeqIO.parse('human_promoter500.fasta', "fasta") :
    seq = seq_record.seq
    id = seq_record.id
    search_results = motif.search_pwm(seq, threshold=0.7 * max_score)
    for pos, score in search_results :
        print id, pos, score, '+'

# We need to understand better what it means when there is a hit
# on the negative strand
from Bio import Motif
motif=Motif.read(open("MA0024.1.pfm"), "jaspar-pfm")
max_score = motif.max_score()
from Bio import SeqIO
for seq_record in SeqIO.parse('human_promoter500.fasta', "fasta") :
    seq = seq_record.seq
    id = seq_record.id
    rc = seq.reverse_complement()
    search_results = motif.search_pwm(seq, threshold=0.7 * max_score)
    for pos, score in search_results :
        if pos > 0 :
            print id, pos, score, '+'
            print seq[pos:pos + len(motif)]
        else :
            print id, pos, score, '-'
            print seq[-pos : -pos + len(motif)]
    search_results = motif.search_pwm(rc, threshold=0.7 * max_score, both=False)
    for pos, score in search_results :
        print id, pos, score, '-'
        print rc[pos:pos + len(motif)]