Commit 2657000f authored by Pauline Pommeret's avatar Pauline Pommeret
Browse files

[database] Tune up and a bit of doc.

parent d396dc2b
#!/usr/bin/env python2.7
#
# Copyright (C) 2014 Pauline Pommeret <pommeret@crans.org>
# Authors : Pauline Pommeret <pommeret@crans.org>,
# Jonas Sénizergues <senizergues@crans.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Cr@ns nor the names of its contributors may
# be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
# HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Delete a sequence from database.
"""
import argparse
from argparse import RawDescriptionHelpFormatter
import lib.database
if __name__ == "__main__":
    # -*- Parsing command-line arguments using argparse -*- #

    # TEXT is a string that holds the description of the program
    TEXT = """Delete sequence from database."""

    # Creates the parser
    PARSER = argparse.ArgumentParser(description=TEXT, formatter_class=RawDescriptionHelpFormatter)

    # Fills the parser with the program arguments
    PARSER.add_argument("seqid", type=int, help="The id of the sequence to remove.", action="store")

    # Parses the arguments *before* touching the database: argparse exits by
    # itself on --help or on a non-integer seqid, so in those cases no
    # database cursor is created at all.
    ARGS = PARSER.parse_args()

    # The cursor is only created once the arguments are known to be valid.
    DBCURSOR = lib.database.XylokPGCursor()

    # The value returned by remove_seq() is compared to 1 to tell whether
    # the sequence existed.
    # print is called with a single parenthesised argument so the statement
    # prints the same text under Python 2.
    if DBCURSOR.remove_seq(ARGS.seqid) >= 1:
        print("Sequence %s has been successfully removed." % (ARGS.seqid,))
    else:
        print("Sequence %s does not exist." % (ARGS.seqid,))
......@@ -114,10 +114,10 @@ if __name__ == "__main__":
# Creates a Sequence object
seq = sequence.Sequence(fasta_file, md_parameters, alphabet=ARGS.alphabet, trx_scale_path=ARGS.trx_scale_file, sliding=ARGS.sliding, centering=ARGS.centering, alpha=ARGS.alpha, graph=ARGS.graph)
seq_id = DBCURSOR.check_sequence(seq)
seq_id, label = DBCURSOR.check_sequence(seq)
if seq_id:
print "This sequence is already in database (id=%s)." % (seq_id,)
sys.exit(0)
print "This sequence is already in database (id=%s, label=%s)." % (seq_id, label)
sys.exit(42)
# We have to build the DT only if we are certain that the sequence is
# not in the db, as it could alter the result of the DT.
......@@ -131,7 +131,9 @@ if __name__ == "__main__":
seq_id = DBCURSOR.store_sequence(seq)
# Done, moving on to next directory
print "Data stored in %r added to database under id %s" % (cur_dir, seq_id)
print "Data stored in %s added to database under id %s" % (cur_dir, seq_id)
data = DBCURSOR.fetch_averaged_specific_sequence(seq_id)
print PA[DT.predict(data)[0]]
label = PA[DT.predict(data)[0]]
print "Guessed label for sequence %s : %s" % (cur_dir, label)
DBCURSOR.label_sequence(seq_id, label)
......@@ -57,12 +57,148 @@ class XylokPGCursor(object):
Returns:
- nothing
"""
self._conn = psycopg2.connect(database='itpp')
self._cur = self._conn.cursor()
self._conn.set_session(autocommit=True)
self._cur = self._conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
def check_sequence(self, seq):
    """
    Looks a sequence up in the ``sequences`` table by its sequence string.

    Parameters:
    - ``self``
    - ``seq`` : sequence object (:py:mod:`sequence`); only its
      ``sequence`` attribute is read here

    Returns:
    - ``(id, label)`` : the two selected columns of the fetched row, or
      ``(0, None)`` when no row is found
    """
    lookup_query = """SELECT
id, label
FROM
sequences
WHERE
sequence=%s;"""
    self._cur.execute(lookup_query, (seq.sequence,))
    row = self._cur.fetchone()

    # Guard clause: a falsy result means "not recorded yet".
    if not row:
        return 0, None
    return row[0], row[1]
def store_sequence(self, seq):
    """
    Stores the sequence data in database.

    Inserts, in this order: one row in ``sequences``, one row per frame in
    each ``md_<helicoidal parameter>`` table, one row in ``trx`` and one
    row per frame for every pair of types in ``correlations``.

    Parameters:
    - ``self``
    - ``seq`` : sequence object (:py:mod:`sequence`)

    Returns:
    - ``seq_id`` : the id returned by the INSERT into ``sequences``
    """
    # This query is designed to add the sequence metadata in the database.
    # We give explicitly the columns of the table we want to fill, and we also give the dict keys for the VALUES,
    # so we only have to give seq to cur.execute() as if seq was a dict.
    main_seq_query = """INSERT INTO
sequences
(accession, name, description, sequence, label, alphabet, alpha, sliding, centering, trx_scale_path)
VALUES
(%(accession)s, %(name)s, %(description)s, %(sequence)s, %(label)s, %(alphabet)s,\
%(alpha)s, %(sliding)s, %(centering)s, %(trx_scale_path)s)
RETURNING
id;"""
    self._cur.execute(main_seq_query, seq)

    # The query returns the id of the new recorded data. Will be useful for the other queries.
    seq_id = self._cur.fetchone()[0]

    # This template is formatted once per helicoidal parameter, for the
    # table name only (a table name cannot be a bound parameter). Every
    # value, including seq_id and frame_num, is left as a %%(attr)s
    # placeholder so that the database driver binds it.
    md_query_template = """INSERT INTO
md_%s
(seq_id, frame_num, complete_peak_freq, complete_peak, complete_size, center_peak_freq,\
center_peak, center_size, sliding_peak_freq, sliding_peak, sliding_size)
VALUES
(%%(seq_id)s, %%(frame_num)s, %%(complete_peak_freq)s, %%(complete_peak)s, %%(complete_size)s, %%(center_peak_freq)s,\
%%(center_peak)s, %%(center_size)s, %%(sliding_peak_freq)s, %%(sliding_peak)s,\
%%(sliding_size)s);"""

    # We do a record for each helicoidal param, recording all the frames in the good table.
    for helicoidal_parameter in seq.mdd:
        md_query = md_query_template % (helicoidal_parameter,)
        for (frame_num, values) in seq.mdd[helicoidal_parameter].items():
            # Work on a copy so that the data held by seq is not modified.
            dic_to_sql = dict(values)
            dic_to_sql.update({'seq_id': seq_id, 'frame_num': frame_num})
            self._cur.execute(md_query, dic_to_sql)

    # Trx data.
    trx_query = """INSERT INTO
trx
(seq_id, complete_peak_freq, complete_peak, complete_size, center_peak_freq,\
center_peak, center_size, sliding_peak_freq, sliding_peak, sliding_size)
VALUES
(%(seq_id)s, %(complete_peak_freq)s, %(complete_peak)s, %(complete_size)s,\
%(center_peak_freq)s, %(center_peak)s, %(center_size)s, %(sliding_peak_freq)s, %(sliding_peak)s, %(sliding_size)s);"""
    dic_to_sql = dict(seq.trx)
    dic_to_sql.update({'seq_id': seq_id})
    self._cur.execute(trx_query, dic_to_sql)

    # Correlation data. The query text does not depend on the loop
    # variables, so it is built once.
    corr_query = """INSERT INTO
correlations
(seq_id, frame_num, type_a, type_b, spearman_complete, spearman_center, pearson_complete, pearson_center)
VALUES
(%(seq_id)s, %(frame_num)s, %(type_a)s, %(type_b)s, %(spearman_complete)s,\
%(spearman_center)s, %(pearson_complete)s, %(pearson_center)s);"""
    for (correl_types, bunch_of_data) in seq.correlation.items():
        # Keys of seq.correlation are split on "/" into the two types.
        type_a, type_b = correl_types.split("/")
        for (frame_num, data) in bunch_of_data.items():
            dic_to_sql = dict(data)
            dic_to_sql.update({
                "frame_num": frame_num,
                "seq_id": seq_id,
                "type_a": type_a,
                "type_b": type_b,
            })
            self._cur.execute(corr_query, dic_to_sql)

    return seq_id
def remove_seq(self, seq_id):
    """
    Removes a sequence from database.

    Rows referring to the sequence are deleted from the six ``md_*``
    tables, then from ``trx`` and ``correlations``, and finally the row
    itself is deleted from ``sequences``.

    Parameters:
    - ``self``
    - ``seq_id`` : sequence id (int)

    Returns:
    - the cursor's ``rowcount`` read right after the final DELETE on
      ``sequences``
    """
    for md_type in ("rise", "roll", "shift", "slide", "tilt", "twist"):
        # Only the table name, taken from the fixed tuple above, is
        # formatted into the statement; seq_id is bound by the driver like
        # in the other DELETE statements below.
        self._cur.execute("DELETE FROM md_%s WHERE seq_id=%%s" % (md_type,), (seq_id,))
    self._cur.execute('DELETE FROM trx WHERE seq_id=%s', (seq_id,))
    self._cur.execute('DELETE FROM correlations WHERE seq_id=%s', (seq_id,))
    self._cur.execute('DELETE FROM sequences WHERE id=%s', (seq_id,))
    return self._cur.rowcount
def fetch_averaged_specific_sequence(self, seq_id):
# XXX meh.
"""
Retrieve averaged data about a sequence.
Parameters:
- ``self``
- ``seq_id`` : sequence id (int)
Returns:
- ``data_list`` : a list of floats, which describes the sequence
for the :py:mod:`sklearn.tree.DecisionTree`.
"""
data_list = []
data_list.extend(self.fetch_trx_data(seq_id))
data_list.extend(self.fetch_averaged_md_data(seq_id))
......@@ -232,129 +368,23 @@ WHERE
self._cur.execute(trx_query)
return list(self._cur.fetchone())
class PGCursor(object):
"""
PostGreSQL Python Cursor.
"""
def __init__(self):
def label_sequence(self, seq_id, label):
"""
Starts a psycopg2 cursor and populates:
- ``self._conn``
- ``self._cur``
Label the selected sequence
Parameters:
- ``self``
- ``seq_id`` : sequence id (int)
- ``label`` : the label for sequence (str)
Returns:
- nothing
"""
self._conn = psycopg2.connect(database='itpp')
self._conn.set_session(autocommit=True)
self._cur = self._conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
def check_sequence(self, seq):
"""
Checks if seq is already recorded in database.
"""
verif_query = """SELECT
id
FROM
update_query = """UPDATE
sequences
SET
label = %s
WHERE
sequence=%s;"""
self._cur.execute(verif_query, (seq.sequence,))
ret = self._cur.fetchone()
if ret:
return ret[0]
else:
return False
def store_sequence(self, seq):
    """
    Stores the sequence data in database.

    Inserts, in this order: one row in ``sequences``, one row per frame in
    each ``md_<helicoidal parameter>`` table, one row in ``trx`` and one
    row per frame for every pair of types in ``correlations``.

    Parameters:
    - ``self``
    - ``seq`` : sequence object (:py:mod:`sequence`)

    Returns:
    - ``seq_id`` : the id returned by the INSERT into ``sequences``
    """
    # This query is designed to add the sequence metadata in the database.
    # We give explicitly the columns of the table we want to fill, and we also give the dict keys for the VALUES,
    # so we only have to give seq to cur.execute() as if seq was a dict.
    main_seq_query = """INSERT INTO
sequences
(accession, name, description, sequence, label, alphabet, alpha, sliding, centering, trx_scale_path)
VALUES
(%(accession)s, %(name)s, %(description)s, %(sequence)s, %(label)s, %(alphabet)s,\
%(alpha)s, %(sliding)s, %(centering)s, %(trx_scale_path)s)
RETURNING
id;"""
    self._cur.execute(main_seq_query, seq)

    # The query returns the id of the new recorded data. Will be useful for the other queries.
    seq_id = self._cur.fetchone()[0]

    # This template is formatted once per helicoidal parameter, for the
    # table name only (a table name cannot be a bound parameter). Every
    # value, including seq_id and frame_num, is left as a %%(attr)s
    # placeholder so that the database driver binds it.
    md_query_template = """INSERT INTO
md_%s
(seq_id, frame_num, complete_peak_freq, complete_peak, complete_size, center_peak_freq,\
center_peak, center_size, sliding_peak_freq, sliding_peak, sliding_size)
VALUES
(%%(seq_id)s, %%(frame_num)s, %%(complete_peak_freq)s, %%(complete_peak)s, %%(complete_size)s, %%(center_peak_freq)s,\
%%(center_peak)s, %%(center_size)s, %%(sliding_peak_freq)s, %%(sliding_peak)s,\
%%(sliding_size)s);"""

    # We do a record for each helicoidal param, recording all the frames in the good table.
    for helicoidal_parameter in seq.mdd:
        md_query = md_query_template % (helicoidal_parameter,)
        for (frame_num, values) in seq.mdd[helicoidal_parameter].items():
            # Work on a copy so that the data held by seq is not modified.
            dic_to_sql = dict(values)
            dic_to_sql.update({'seq_id': seq_id, 'frame_num': frame_num})
            self._cur.execute(md_query, dic_to_sql)

    # Trx data.
    trx_query = """INSERT INTO
trx
(seq_id, complete_peak_freq, complete_peak, complete_size, center_peak_freq,\
center_peak, center_size, sliding_peak_freq, sliding_peak, sliding_size)
VALUES
(%(seq_id)s, %(complete_peak_freq)s, %(complete_peak)s, %(complete_size)s,\
%(center_peak_freq)s, %(center_peak)s, %(center_size)s, %(sliding_peak_freq)s, %(sliding_peak)s, %(sliding_size)s);"""
    dic_to_sql = dict(seq.trx)
    dic_to_sql.update({'seq_id': seq_id})
    self._cur.execute(trx_query, dic_to_sql)

    # Correlation data. The query text does not depend on the loop
    # variables, so it is built once.
    corr_query = """INSERT INTO
correlations
(seq_id, frame_num, type_a, type_b, spearman_complete, spearman_center, pearson_complete, pearson_center)
VALUES
(%(seq_id)s, %(frame_num)s, %(type_a)s, %(type_b)s, %(spearman_complete)s,\
%(spearman_center)s, %(pearson_complete)s, %(pearson_center)s);"""
    for (correl_types, bunch_of_data) in seq.correlation.items():
        # Keys of seq.correlation are split on "/" into the two types.
        type_a, type_b = correl_types.split("/")
        for (frame_num, data) in bunch_of_data.items():
            dic_to_sql = dict(data)
            dic_to_sql.update({
                "frame_num": frame_num,
                "seq_id": seq_id,
                "type_a": type_a,
                "type_b": type_b,
            })
            self._cur.execute(corr_query, dic_to_sql)

    return seq_id
def remove_seq(self, seq_id):
    """
    Removes a sequence from database.

    Rows referring to the sequence are deleted from the six ``md_*``
    tables, then from ``trx`` and ``correlations``, and finally the row
    itself is deleted from ``sequences``.

    Parameters:
    - ``self``
    - ``seq_id`` : sequence id (int)

    Returns:
    - nothing
    """
    for md_type in ("rise", "roll", "shift", "slide", "tilt", "twist"):
        # Only the table name, taken from the fixed tuple above, is
        # formatted into the statement; seq_id is bound by the driver like
        # in the other DELETE statements below.
        self._cur.execute("DELETE FROM md_%s WHERE seq_id=%%s" % (md_type,), (seq_id,))
    self._cur.execute('DELETE FROM trx WHERE seq_id=%s', (seq_id,))
    self._cur.execute('DELETE FROM correlations WHERE seq_id=%s', (seq_id,))
    self._cur.execute('DELETE FROM sequences WHERE id=%s', (seq_id,))
id = %s;"""
self._cur.execute(update_query, (label, seq_id))
......@@ -145,7 +145,7 @@ A .dat file must me organized this way:
# Creates a Sequence object
seq = sequence.Sequence(fasta_file, md_parameters, label, alphabet=ARGS.alphabet, trx_scale_path=ARGS.trx_scale_file, sliding=ARGS.sliding, centering=ARGS.centering, alpha=ARGS.alpha, graph=ARGS.graph)
seq_id = DBCURSOR.check_sequence(seq)
seq_id, label = DBCURSOR.check_sequence(seq)
if seq_id:
if raw_input("This sequence is already in database. Would you like to recompute its data? [y/n]") != "y":
continue
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment