Commit 6b097650 authored by Pauline Pommeret
Browse files

[file_tools.py] Fixing load_md_data (regexp fail)

parent 26c69bec
......@@ -15,6 +15,8 @@ def load_fasta(sequence_alphabet, path):
Arguments:
* alphabet : the IUPAC
* path : path to FASTA file
Returns a Bio.SeqRecord.SeqRecord object
"""
return SeqIO.read(path, format="fasta", alphabet=sequence_alphabet)
......@@ -23,32 +25,52 @@ def load_md_data(path):
"""
"""
positions = ""
positions = []
output = []
with open(path, 'r') as handle:
# Fetch the sequence positions from the first line
# Retrieves nucleotides positions from the "header" line
# The header is the first line
header = handle.readline()
# Getting rid of the "\n" at the end of the line and splitting to get
# this kind of list ['frame', 'C16/C17', 'C17/G18', ..., 'A130/G131']
header = header.replace('\n', '').split()
# Working on 'frame' (header[0]) or string like 'C17/G18'
for c in header:
positions += re.sub(r'[0-9]', '', c.split('/')[0])
# Uses the string at the left side of the '/'
positions.append(re.sub(r'[A-Z]', '', c.split('/')[0]))
# Pour avoir le dernier élément (quand même)
positions += re.sub(r'[0-9]', '', header[-1].split('/')[1])
# Retrieves the last element (never on the left side of the '/')
positions.append(re.sub(r'[A-Z]', '', header[-1].split('/')[1]))
# Pour enlever 'frame' parce qu'il parait que c'est pas dans l'ADN
positions = re.sub(r'frame', '', positions)
# Getting rid of 'frame' (first word of the file)
positions.pop(0)
# Obtenir un json.dump du fichier en entier
# Starting to work on the 'real' data (frame 0)
line = handle.readline()
# Processing every data line in the file
while line:
# Getting a list of the helicoidal parameter value
# ['frame number', '-10.9', ..., '5.2']
line = line.replace('\n', '').split()
newline = [float(x) for x in line]
newline = [[float(re.sub(r'[A-Za-z]', '', header[x].split('/')[0])), newline[x]] for x in range(1, len(newline))]
# Casting the frame number to int and the helicoidal parameter
# values to float for further use
newline = [int(line[0]]) + [float(x) for x in line[1:]]
# Creating a list looking like
# [..., ['position_i', 'helicoidal_parameter_value_i'], ...]
# There is a shift between positions (contains frame number) and
# line (doesn't)
newline = [[int(positions[x]), line[x+1]] for x in xrange(len(positions))]
# Inserting the list ["frame", frame number] at first position
# (for developer convenience)
newline.insert(0, ["frame", line[0]])
# Saving the processed line to the output
output.append(newline)
# Next line
line = handle.readline()
return (positions, output)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment