#!/usr/bin/env python2.7
# XXX
"""
XXX
    * local match
    * on the whole sequence
    * totex
"""


def pair_score(string, TRX):
    """
    Look up the TRX value of a dinucleotide step, optionally refined by its
    two flanking nucleotides.

    ``string`` is either 2 letters long (the step itself) or 4 letters long
    (the step surrounded by one nucleotide on each side). ``TRX`` is the TRX
    scale dictionary mapping patterns to values.

    For a 4 letter ``string``, a key matches when it contains the 2 central
    letters and is either a plain 2 letter key, or a longer key whose
    characters 1:3 contain the first letter and whose characters 7:9 contain
    the last letter. Those fixed slices suggest longer keys are written like
    ``[XY]NN[ZW]`` -- TODO confirm against the TRX scale definition.

    For a 2 letter ``string``, only the 2 letter keys are considered.

    Returns the value stored in ``TRX`` for the most specific (longest)
    matching key. Raises ``NoMatch`` when no key matches, which includes any
    ``string`` whose length is neither 2 nor 4.
    """
    potential = []
    if len(string) == 4:
        core = string[1:3]
        for pattern in TRX:
            # Literal substring tests: the sequence letters are data, not
            # regular expressions, so characters such as "." or "*" in a
            # sequence can neither match spuriously nor raise.
            if core not in pattern:
                continue
            if len(pattern) == 2:
                # Kept as a fallback in case no longer key matches the
                # flanking nucleotides.
                potential.append(pattern)
            elif string[0] in pattern[1:3] and string[-1] in pattern[7:9]:
                potential.append(pattern)
    elif len(string) == 2:
        potential = [
            pattern for pattern in TRX
            if len(pattern) == 2 and string in pattern
        ]

    if not potential:
        # If the potential list is empty, clearly something went wrong.
        raise NoMatch("No match found, please check your sequence and TRX scale.")

    # The longest key is the most specific one. Rank by length explicitly
    # instead of relying on the lexicographic order of the keys; the key
    # itself is only a tiebreak between keys of equal length.
    return TRX[max(potential, key=lambda pattern: (len(pattern), pattern))]


def match(sequence, TRX):
    """
    Translate a raw ``sequence`` into the TRX value of each of its steps.

    Position i describes the phosphate between nucleotides i and i+1.
    The first and the last steps have only one neighbour, so they are scored
    from the dinucleotide alone; every other step is scored from the
    tetranucleotide made of the step and its two flanking nucleotides.

    Returns a dictionary mapping each position (0 to len(sequence)-2) to the
    value returned by ``pair_score``. A sequence shorter than 2 letters
    gives an empty dictionary. Any exception raised by ``pair_score``
    propagates to the caller.
    """
    # str(sequence) is used instead of sequence.tostring(), as recommended
    # by the documentation (25/11/2014).
    sequence = str(sequence)
    last = len(sequence) - 2
    sequence_TRX = {}
    for position in range(len(sequence) - 1):
        if position == 0 or position == last:
            fragment = sequence[position:position + 2]
        else:
            fragment = sequence[position - 1:position + 3]
        sequence_TRX[position] = pair_score(fragment, TRX)
    return sequence_TRX


def sliding_pb(sequence_TRX, shift=72):
    """
    Average the TRX values over a sliding window of ``shift`` steps.

    ``sequence_TRX`` is a dictionary whose keys are the positions
    0 .. len(sequence_TRX)-1 and whose values are the TRX values at those
    positions (the keys are indexed directly, so they must be contiguous).
    ``shift`` (int, default 72) is the window width.

    Returns a dictionary mapping each window start position to the mean of
    the ``shift`` values starting there, rounded to 2 decimals.

    Raises ``ShiftOutOfRange`` unless ``sequence_TRX`` holds strictly more
    than ``shift`` values.
    """
    total = len(sequence_TRX)
    # Test if sequence_TRX and shift have compatible length.
    if total > shift:
        sliding_TRX = {}
        # The last full window starts at total - shift, hence the + 1.
        for position in range(total - shift + 1):
            window = [sequence_TRX[i] for i in range(position, position + shift)]
            sliding_TRX[position] = numpy.round(numpy.mean(window), decimals=2)
        return sliding_TRX
    else:
        raise ShiftOutOfRange("Choosen shift is too long for query sequence.")