Commit ed0d50e2 authored by Pauline Pommeret's avatar Pauline Pommeret
Browse files

Way too much stuff.

parent 92fd9acc
# TRX scale in use in November 2012
# tetranucleotides
YCGR 59
RCGR 51
YCGY 51
RCGY 38
YCAR 59
RCAR 30
YCAY 40
RCAY 27
YTGR 59
RTGR 40
YTGY 30
RTGY 27
# dinucleotides
CG 45
CA 39
TG 39
TA 14
AA 5
AG 10
GA 23
GG 42
CC 42
CT 10
TC 23
TT 5
AC 4
AT 0
GT 4
GC 30
......@@ -3,13 +3,14 @@
"""
"""
import os
import math
import json
import pandas
import lib.fft
import lib.fft_tools
STUDENT_FILE = "student.json"
STUDENT_FILE = os.path.join(os.path.dirname(__file__), "../data/student.json")
def compute_correlations(_md_param1, _md_param2, helicoidal_parameter_1, helicoidal_parameter_2, alpha=0.05, centering=72):
"""
......@@ -32,7 +33,7 @@ def fetch_student(alpha, ddl):
student_dict = json.load(student_file)
if alpha not in student_dict:
raise KeyError("alpha = %r is not in the Student table" % (alpha,))
raise KeyError("alpha = %r is not in the Student table" % (alpha,))
if ddl not in student_dict[alpha]:
# Using usual rounding rules
......@@ -48,9 +49,9 @@ def build_dataframe(helicoidal_parameter_1_data, helicoidal_parameter_2_data, he
"""
dataframe = {
helicoidal_parameter_1: lib.fft.split_frame(helicoidal_parameter_1_data)[1][1],
helicoidal_parameter_2: lib.fft.split_frame(helicoidal_parameter_2_data)[1][1],
}
helicoidal_parameter_1: lib.fft_tools.split_frame(helicoidal_parameter_1_data)[1][1],
helicoidal_parameter_2: lib.fft_tools.split_frame(helicoidal_parameter_2_data)[1][1],
}
return pandas.DataFrame(dataframe)
......@@ -59,19 +60,15 @@ def stats_test(dataframe, helicoidal_parameter_1, helicoidal_parameter_2, alpha=
"""
result = {
"spearman": {
"complete": False,
"center": False,
},
"pearson": {
"complete": False,
"center": False,
},
}
"spearman_complete": False,
"spearman_center": False,
"pearson_complete": False,
"pearson_center": False,
}
seq_length = len(dataframe[helicoidal_parameter_1])
for test_type in result:
for chan in result[test_type]:
for test_type in ["spearman", "pearson"]:
for chan in ["complete", "center"]:
if chan == "complete":
length = seq_length
offset = 0
......@@ -88,7 +85,7 @@ def stats_test(dataframe, helicoidal_parameter_1, helicoidal_parameter_2, alpha=
t_alpha = fetch_student(alpha, length - 2)
if -t_alpha < test < t_alpha:
result[test_type][chan] = False
result["%s_%s" % (test_type, chan)] = False
else:
result[test_type][chan] = True
result["%s_%s" % (test_type, chan)] = True
return result
#!/usr/bin/env python2.7
import psycopg2
import psycopg2.extras
class PGCursor(object):
    """
    PostGreSQL Python cursor for the "itpp" database.

    Wraps a psycopg2 connection opened in autocommit mode with a
    DictCursor, and knows how to persist a Sequence object.
    """

    def __init__(self):
        """
        Open a connection to the local "itpp" database and create a
        DictCursor on it (rows behave like dictionaries).
        """
        self._conn = psycopg2.connect(database='itpp')
        # Autocommit: every execute() is committed on its own, so
        # callers never have to manage transactions explicitly.
        self._conn.set_session(autocommit=True)
        self._cur = self._conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

    def store_sequence(self, seq):
        """
        Store the sequence data in database.

        Inserts, in order:
          * the main row in ``sequences`` (returning its id),
          * one row per frame and helicoidal parameter in ``md_<param>``,
          * one row in ``trx``,
          * one row per frame and parameter pair in ``correlations``.
        """
        # "group" is a reserved SQL keyword: it must be double-quoted
        # here, exactly as it is in create_tables.sql, or the INSERT
        # is a syntax error.
        main_seq_query = """INSERT INTO
            sequences
            (an, name, description, sequence, "group", alphabet, alpha, sliding, centering, trx_scale_path)
        VALUES
            (%(an)s, %(name)s, %(description)s, %(sequence)s, %(group)s, %(alphabet)s, %(alpha)s, %(sliding)s, %(centering)s, %(trx_scale_path)s)
        RETURNING
            id;"""
        self._cur.execute(main_seq_query, seq.seq_dict)
        seq_id = self._cur.fetchone()[0]
        for helicoidal_parameter in seq.md:
            for (frame_num, values) in seq.md[helicoidal_parameter].iteritems():
                # The table name and the first two values are interpolated
                # with % (they come from our own code, not user input); the
                # %%(...)s escapes become the %(...)s placeholders that
                # psycopg2 binds from ``values``.
                md_query = """INSERT INTO
                    md_%s
                    (seq_id, frame_num, complete_peak_freq, complete_peak, complete_size, center_peak_freq, center_peak, center_size, sliding_peak_freq, sliding_peak, sliding_size)
                VALUES
                    (%s, %s, %%(complete_peak_freq)s, %%(complete_peak)s, %%(complete_size)s, %%(center_peak_freq)s, %%(center_peak)s, %%(center_size)s, %%(sliding_peak_freq)s, %%(sliding_peak)s, %%(sliding_size)s);""" % (helicoidal_parameter, seq_id, frame_num)
                self._cur.execute(md_query, values)
        trx_query = """INSERT INTO
            trx
            (seq_id, complete_peak_freq, complete_peak, complete_size, center_peak_freq, center_peak, center_size, sliding_peak_freq, sliding_peak, sliding_size)
        VALUES
            (%(seq_id)s, %(complete_peak_freq)s, %(complete_peak)s, %(complete_size)s, %(center_peak_freq)s, %(center_peak)s, %(center_size)s, %(sliding_peak_freq)s, %(sliding_peak)s, %(sliding_size)s);"""
        dic_to_sql = dict(seq.trx)
        dic_to_sql.update({'seq_id': seq_id})
        # Bug fix: bind dic_to_sql (which contains seq_id), not seq.trx
        # (which does not, so %(seq_id)s would fail to resolve).
        self._cur.execute(trx_query, dic_to_sql)
        for (correl_types, bunch_of_data) in seq.correlation:
            # correl_types looks like "type_a/type_b".
            type_a, type_b = correl_types.split("/")
            for (frame_num, data) in bunch_of_data:
                corr_query = """INSERT INTO
                    correlations
                    (seq_id, frame_num, type_a, type_b, spearman_complete, spearman_center, pearson_complete, pearson_center)
                VALUES
                    (%(seq_id)s, %(frame_num)s, %(type_a)s, %(type_b)s, %(spearman_complete)s, %(spearman_center)s, %(pearson_complete)s, %(pearson_center)s);"""
                dic_to_sql = dict(data)
                dic_to_sql.update({
                    "frame_num": frame_num,
                    "seq_id": seq_id,
                    "type_a": type_a,
                    "type_b": type_b,
                })
                self._cur.execute(corr_query, dic_to_sql)
......@@ -9,6 +9,9 @@ import numpy
from lib.XylokExceptions import ShiftOutOfRange
def split_frame(frame):
"""Split a frame and reorder x and y tables
"""
x = []
y = []
......@@ -75,14 +78,14 @@ def fft(frame, centering=None):
return (frame_number, list(freqs), list(transform))
def get_noticeable_data(fft):
def get_noticeable_data(fft_data):
"""
Fetches (argmax, max) in fft data, and the size of the peak
associated with this max.
"""
frame_number, freqs, transform = fft
_, freqs, transform = fft_data
# We do a reverse copy of transform because we want
# to fetch the greater peak_index.
......@@ -135,7 +138,7 @@ def sliding_fft(frame, window=72):
modified_x = []
modified_y = []
frame_number, (x, y) = split_frame(frame)
frame_number, (_, y) = split_frame(frame)
if len(y) <= window:
raise ShiftOutOfRange("Choosen shift is too long for query sequence.")
......@@ -144,7 +147,7 @@ def sliding_fft(frame, window=72):
modified_x.append(i)
modified_y.append(numpy.round(numpy.mean(y[i:i+window]), 3))
modified_frame = { i: j for (i, j) in zip(modified_x, modified_y)}
modified_frame = {i: j for (i, j) in zip(modified_x, modified_y)}
modified_frame['frame'] = frame_number
return fft(modified_frame)
......
......@@ -19,6 +19,12 @@ def load_fasta(sequence_alphabet, path):
Returns a Bio.SeqRecord.SeqRecord object
"""
if sequence_alphabet.lower() == 'dna':
sequence_alphabet = IUPAC.unambiguous_dna
else:
raise NotImplementedError
return SeqIO.read(path, format="fasta", alphabet=sequence_alphabet)
def load_md_data(path):
......
#!/usr/bin/env python2.7
# XXX
"""
XXX
* match local
* sur toute la séquence
* totex
Docstring
"""
import os
# This dictionary is there to translate TRX scale
nucleotide_map = {"Y" : "[CT]", "R" : "[AG]"}
scale_file = os.path.join(os.path.dirname(__file__), '../data/trx_scale')
def translate_trx_scale(pattern, n_map):
    """
    Translate a matching pattern into a regexp-ready string using a
    nucleotide map.

    Letters found in ``n_map`` are replaced by their character class;
    any other letter is kept unchanged.

    Example: "YCGR" becomes "[CT]CG[AG]"
    """
    translated = []
    for base in pattern:
        # Unknown letters fall through untouched (dict.get default).
        translated.append(n_map.get(base, base))
    return "".join(translated)
def parse_trx_scale(path_to_scale):
    """
    Parses a TRX scale file and creates a dictionary with the scale. It uses
    :py:module:trx_tools:translate_trx_scale in order to use unambiguous DNA
    alphabet later.

    Returns a TRX dictionary that looks like this:
    TRX = {'AA' : 5.0, ..., '[CT]CG[AG]': 59.0, ...}
    """
    TRX = {}
    with open(path_to_scale, "r") as handle:
        # Iterate the file line by line (idiomatic replacement for the
        # manual readline() loop).
        for line in handle:
            # Skip empty lines and lines starting with # (comments).
            if line.strip() == "" or line.startswith("#"):
                continue
            # Each data line is "<pattern> <value>", e.g. "YCGR 59":
            # the pattern (2 or 4 letters) is translated via
            # translate_trx_scale, the value is cast to float.
            fields = line.split()
            TRX[translate_trx_scale(fields[0], nucleotide_map)] = float(fields[1])
    # At this point, we have a TRX dictionary that looks like this:
    # TRX = {'AA' : 5.0, ..., '[CT]CG[AG]': 59.0, ...}
    return TRX
def pair_score(string, TRX):
"""
Tries to match ``string`` (a 2 or 4 letter long string) with the patterns
......@@ -89,7 +134,7 @@ def sliding_pb(sequence_TRX, shift=72):
for position in xrange(0, len(sequence)-shift):
sliding_TRX[position] = numpy.round(numpy.mean([sequence_TRX[i] for i in xrange(position, position+shift)]), decimals=2)
return sliding_TRX
elif:
else:
raise ShiftOutOfRange("Choosen shift is too long for query sequence.")
#!/usr/bin/env python2.7
# XXX
"""
Docstring
"""
# This dictionary is there to translate TRX scale
nucleotide_map = { "Y" : "[CT]", "R" : "[AG]"}
def translate_trx_scale(pattern, nucleotide_map):
    """
    Translate a matching pattern so that it can be used with regexps,
    using a nucleotide map.

    Example: "YCGR" becomes "[CT]CG[AG]"
    """
    # Letters absent from the map are kept as-is (dict.get default).
    return "".join(nucleotide_map.get(letter, letter) for letter in pattern)
def parse_trx_scale(path_to_scale):
    """
    Parses a TRX scale file and creates a dictionary with the scale. It uses
    :py:module:trx_tools:translate_trx_scale in order to use unambiguous DNA
    alphabet later.

    Returns a TRX dictionary that looks like this:
    TRX = {'AA' : 5.0, ..., '[CT]CG[AG]': 59.0, ...}
    """
    TRX = {}
    with open(path_to_scale, "r") as handle:
        # Idiomatic line iteration instead of a manual readline() loop.
        for line in handle:
            # Not taking into account empty lines and lines starting
            # with # (comments).
            if line.strip() == "" or line.startswith("#"):
                continue
            # Split once: fields[0] is the nucleotide string (2 or 4
            # letters) needing translation, fields[1] the value to cast
            # into float.
            fields = line.split()
            TRX[translate_trx_scale(fields[0], nucleotide_map)] = float(fields[1])
    # At this point, we have a TRX dictionary that looks like this:
    # TRX = {'AA' : 5.0, ..., '[CT]CG[AG]': 59.0, ...}
    return TRX
......@@ -4,26 +4,25 @@
Un fichier pour faire des tests
"""
import os
import argparse
import sequence
if __name__ == "__main__":
# Use argparse to parse arguments. We first create a parser.
parser = argparse.ArgumentParser(description="ToDo")
# Adding arguments to the parser
parser.addArgument("-a", "--alphabet", type=str, help="Alphabet (DNA, RNA, prot): Unambiguous IUPAC alphabet that will be used.", action="store")
parser.addArgument("-f", "--fasta", type=str, help="Path to the FASTA file to load.", action="store")
parser.addArgument("-r", "--roll", type=str, help="Path to the Roll MD file.", action="store")
parser.addArgument("-R", "--rise", type=str, help="Path to the Rise MD file.", action="store")
parser.addArgument("-s", "--slide", type=str, help="Path to the Slide MD file.", action="store")
parser.addArgument("-S", "--shift", type=str, help="Path to the Shift MD file.", action="store")
parser.addArgument("-t", "--twist", type=str, help="Path to the Twist MD file.", action="store")
parser.addArgument("-T", "--tilt", type=str, help="Path to the Tilt MD file.", action="store")
parser.add_argument("-a", "--alphabet", type=str, help="Alphabet (DNA, RNA, prot): Unambiguous IUPAC alphabet that will be used.", action="store")
parser.add_argument("-A", "--alpha", type=float, help="Alpha parameter for statistic analysis", action="store")
parser.add_argument("-c", "--centering", type=int, help="Centering parameter", action="store")
parser.add_argument("-g", "--graph", type=int, help="Plot graphs................. There, it's done.", action="store")
parser.add_argument("-s", "--sliding", type=int, help="Sl______iding parameter", action="store")
parser.add_argument("-t", "--trx-scale-file", type=str, help="Path to trx scale file", action="store")
parser.add_argument("datadir", type=str, help="Path to the data directory.", action="store")
# Then, parse it
args = parse.parse_args()
if not args.fasta:
raise EnvironmentError("You have to give a FASTA file.")
# Blah
# seq = Sequence(args.alphabet, args.fasta, args.roll, args.rise, args.slide, ...)
args = parser.parse_args()
if not args.datadir:
raise EnvironmentError("You have to give a data directory.")
......@@ -6,9 +6,9 @@ Docstring
"""
import lib.file_tools as file_tools
import lib.fft as fft_lib
import lib.fft_tools as fft_lib
import lib.correlation as correlation_lib
import trx
import lib.trx as trx_lib
class Sequence(object):
# XXX
......@@ -16,7 +16,7 @@ class Sequence(object):
Sequence class
"""
def __init__(self, alphabet, fasta, md_parameters, trx_scale_path=trx.scale_file, sliding=72, centering=72, alpha=0.05, graph=None):
def __init__(self, fasta, md_parameters, group="", alphabet="dna", trx_scale_path=trx_lib.scale_file, sliding=72, centering=72, alpha=0.05, graph=None):
# XXX
"""
......@@ -31,6 +31,7 @@ class Sequence(object):
self.sliding = sliding
self.alpha = alpha
self.centering = centering
self.group = group
self.graph = graph
self.trx_scale_path = trx_scale_path
......@@ -48,6 +49,27 @@ class Sequence(object):
self.load_md(md_parameters)
self.load_trx()
def get(self, name, default):
"""
Try to return self[name], if it fails, returns
default
"""
try:
return getattr(self, name)
except AttributeError:
return default
def __getitem__(self, name):
"""
When doing self["a"], will try to fetch self.a.
If it fails, will raise a KeyError, because it's what
dict["a"] is supposed to return if "a" is not a key.
"""
try:
return getattr(self, name)
except AttributeError:
raise KeyError('%r' % (name,))
def load_fasta(self, fasta):
"""
Loads a sequence from a fasta file.
......@@ -82,30 +104,24 @@ class Sequence(object):
_md_ffts_s = [fft_lib.sliding_fft(frame, window=self.sliding) for frame in _md_params[helicoidal_parameter]]
# Toto
good_values = [fft_lib.get_noticeable_data(fft) for fft in _md_ffts]
good_values = [fft_lib.get_noticeable_data(fft) for fft in _md_ffts]
good_values_c = [fft_lib.get_noticeable_data(fft) for fft in _md_ffts_c]
good_values_s = [fft_lib.get_noticeable_data(fft) for fft in _md_ffts_s]
# _md_ffts[i][0] : frame number
self.md[helicoidal_parameter] = {
_md_ffts[i][0] : {
'complete' : {
'peak_freq' : good_values[i][0],
'peak' : good_values[i][1],
'size' : good_values[i][2],
},
'center' : {
'peak_freq' : good_values_c[i][0],
'peak' : good_values_c[i][1],
'size' : good_values_c[i][2],
},
'sliding' : {
'peak_freq' : good_values_s[i][0],
'peak' : good_values_s[i][1],
'size' : good_values_s[i][2],
},
} for i in xrange(0, len(_md_ffts))
}
_md_ffts[i][0] : {
'complete_peak_freq' : good_values[i][0],
'complete_peak' : good_values[i][1],
'complete_size' : good_values[i][2],
'center_peak_freq' : good_values_c[i][0],
'center_peak' : good_values_c[i][1],
'center_size' : good_values_c[i][2],
'sliding_peak_freq' : good_values_s[i][0],
'sliding_peak' : good_values_s[i][1],
'sliding_size' : good_values_s[i][2],
} for i in xrange(0, len(_md_ffts))
}
if self.graph is not None:
fft_lib.plot(_md_ffts, _md_ffts_c, _md_ffts_s, self.graph, helicoidal_parameter)
......@@ -131,7 +147,7 @@ class Sequence(object):
Loads a Yasara MD file from a MD Yasara data file
"""
trx_dict = trx.match(self.sequence, trx.tools.parse_trx_scale(self.trx_scale_path), self.centering)
trx_dict = trx_lib.match(self.sequence, trx_lib.parse_trx_scale(self.trx_scale_path), self.centering)
fft = fft_lib.fft(trx_dict)
fft_c = fft_lib.fft(trx_dict, centering=self.centering)
......@@ -142,19 +158,13 @@ class Sequence(object):
good_values_s = fft_lib.get_noticeable_data(fft_s)
self.trx = {
'complete' : {
'peak_freq' : good_values[i][0],
'peak' : good_values[i][1],
'size' : good_values[i][2],
},
'center' : {
'peak_freq' : good_values_c[i][0],
'peak' : good_values_c[i][1],
'size' : good_values_c[i][2],
},
'sliding' : {
'peak_freq' : good_values_s[i][0],
'peak' : good_values_s[i][1],
'size' : good_values_s[i][2],
},
}
'complete_peak_freq' : good_values[0],
'complete_peak' : good_values[1],
'complete_size' : good_values[2],
'center_peak_freq' : good_values_c[0],
'center_peak' : good_values_c[1],
'center_size' : good_values_c[2],
'sliding_peak_freq' : good_values_s[0],
'sliding_peak' : good_values_s[1],
'sliding_size' : good_values_s[2],
}
#!/bin/bash
# Bootstrap the "itpp" PostgreSQL database for the role name given as $1:
#   1. create a login role named after the argument,
#   2. create the "itpp" database owned by that role,
#   3. load the schema from ./create_tables.sql as that role.
set -x
# NOTE(review): $1 is interpolated straight into the SQL statement — run
# this only with a trusted role name (no quoting/escaping is done here).
sudo -u postgres psql -d postgres -c "CREATE ROLE $1 LOGIN";
# Quote "$1" so role names are never word-split by the shell.
sudo -u postgres createdb itpp -O "$1";
sudo -u "$1" psql -d itpp -f ./create_tables.sql
-- One row per analysed sequence, together with the analysis parameters
-- used to process it (alpha, sliding window, centering, TRX scale file).
-- "group" is a reserved keyword, hence the quoted identifier.
CREATE TABLE "sequences" (
id SERIAL,
"an" CHARACTER VARYING(20),
"name" CHARACTER VARYING(20) NOT NULL,
"description" TEXT NOT NULL,
"sequence" TEXT NOT NULL,
"group" CHARACTER VARYING(50),
"alphabet" CHARACTER VARYING(5) NOT NULL,
"alpha" REAL NOT NULL,
"sliding" SMALLINT NOT NULL,
"centering" SMALLINT NOT NULL,
"trx_scale_path" CHARACTER VARYING(500) NOT NULL,
CONSTRAINT "sequences_pkey" PRIMARY KEY("id")
) WITHOUT OIDS;
-- Per-frame MD results for the "rise" helicoidal parameter.
-- The complete_/center_/sliding_ column triples hold peak_freq, peak and
-- size values for the three analysis channels (presumably the FFT peak
-- data produced by the Python analysis code — verify against callers).
CREATE TABLE "md_rise" (
"seq_id" INTEGER NOT NULL REFERENCES "sequences",
"frame_num" SMALLINT NOT NULL,
"complete_peak_freq" REAL NOT NULL,
"complete_peak" REAL NOT NULL,
"complete_size" REAL NOT NULL,
"center_peak_freq" REAL NOT NULL,
"center_peak" REAL NOT NULL,
"center_size" REAL NOT NULL,
"sliding_peak_freq" REAL NOT NULL,
"sliding_peak" REAL NOT NULL,
"sliding_size" REAL NOT NULL
);
-- Same structure as md_rise, for the "roll" helicoidal parameter.
CREATE TABLE "md_roll" (
"seq_id" INTEGER NOT NULL REFERENCES "sequences",
"frame_num" SMALLINT NOT NULL,
"complete_peak_freq" REAL NOT NULL,
"complete_peak" REAL NOT NULL,
"complete_size" REAL NOT NULL,
"center_peak_freq" REAL NOT NULL,
"center_peak" REAL NOT NULL,
"center_size" REAL NOT NULL,
"sliding_peak_freq" REAL NOT NULL,
"sliding_peak" REAL NOT NULL,
"sliding_size" REAL NOT NULL
);
-- Same structure as md_rise, for the "shift" helicoidal parameter.
CREATE TABLE "md_shift" (
"seq_id" INTEGER NOT NULL REFERENCES "sequences",
"frame_num" SMALLINT NOT NULL,
"complete_peak_freq" REAL NOT NULL,
"complete_peak" REAL NOT NULL,
"complete_size" REAL NOT NULL,
"center_peak_freq" REAL NOT NULL,
"center_peak" REAL NOT NULL,
"center_size" REAL NOT NULL,
"sliding_peak_freq" REAL NOT NULL,
"sliding_peak" REAL NOT NULL,
"sliding_size" REAL NOT NULL
);