From 08f7fe9dd29aa12c8e3a83d5c8f9b5b6364562f7 Mon Sep 17 00:00:00 2001 From: Pauline Pommeret Date: Wed, 3 Dec 2014 21:26:24 +0100 Subject: [PATCH] Sphinxisation of docstrings --- lib/XylokExceptions.py | 2 +- lib/file_tools.py | 39 +++++++++++++++++++++++---------------- lib/learning.py | 40 +++++++++++++++++++++++++++++++++++++++- populate_db.py | 10 +++++++--- sequence.py | 23 +++++++++++------------ test_sequence.py | 31 ++++++++++++++++++++++--------- 6 files changed, 103 insertions(+), 42 deletions(-) diff --git a/lib/XylokExceptions.py b/lib/XylokExceptions.py index 1a2e8d7..9dba83b 100644 --- a/lib/XylokExceptions.py +++ b/lib/XylokExceptions.py @@ -1,6 +1,6 @@ #!/usr/bin/env python2.7 """ -Definition of the specific errors of Xylok. +This file holds the definitions of Xylok's specific errors. All of them inherit :py:class:`XylokError`. """ diff --git a/lib/file_tools.py b/lib/file_tools.py index 693f4dc..d6499e0 100644 --- a/lib/file_tools.py +++ b/lib/file_tools.py @@ -1,11 +1,20 @@ #!/usr/bin/env python2.7 """ -This module contains functions required to load Xylok date from files. +This file contains the functions required to load Xylok data from files. -Requires: - - :py:mod:`re` - - :py:mod:`Bio.SeqIO` - - :py:mod:`Bio.Alphabet.IUPAC` +.. note:: + MD file must look like: + frame C16/C17 C17/G18 G18/A19 ... + 0 -1.0 9.0 -1.8 + 1 -4.4 -7.6 -0.1 + 2 -6.4 1.0 -6.0 + 3 -3.6 -1.1 -2.5 + +.. seealso:: + Standard libraries: + * :py:mod:`re` + * :py:mod:`Bio.SeqIO` + * :py:mod:`Bio.Alphabet.IUPAC` """ import re @@ -14,17 +23,15 @@ from Bio.Alphabet import IUPAC def load_fasta(sequence_alphabet, path): """ - Loads a fasta file from a file, using its alphabet and returns a record of - it. + Loads a fasta sequence from a file, using its alphabet and returns a record + with all the information in the file. 
Parameters: - ``sequence_alphabet`` : str (dna, rna, prot) - ``path`` : path to FASTA file - Uses: - :py:mod:`Bio.SeqIO` - :py:mod:`Bio.Alphabet` - Returns: - an object of type :py:class:`Bio.SeqRecord.SeqRecord` """ @@ -42,16 +49,16 @@ def load_md_data(path): Loads a post-processed MD file into a list of dictionaries (each frame has its dictionary). - MD file must look like: - frame C16/C17 C17/G18 G18/A19 ... - 0 -1.0 9.0 -1.8 - 1 -4.4 -7.6 -0.1 - 2 -6.4 1.0 -6.0 - 3 -3.6 -1.1 -2.5 + .. note:: + MD file must look like: + frame C16/C17 C17/G18 G18/A19 ... + 0 -1.0 9.0 -1.8 + 1 -4.4 -7.6 -0.1 + 2 -6.4 1.0 -6.0 + 3 -3.6 -1.1 -2.5 Parameters: - ``path`` : str, path to the file - Returns: - a list of dictionaries (each dictionary is a MD frame) [ ... {"frame": frame#, ..., position_i: value_i, ...} ...] diff --git a/lib/learning.py b/lib/learning.py index 2cb4d4b..ee57b2b 100644 --- a/lib/learning.py +++ b/lib/learning.py @@ -1,4 +1,26 @@ #!/usr/bin/env python2.7 +""" +This file handles the learning part of Xylok. For now, it only supports +supervised learning using the decision tree algorithm. + +The decision tree computed is an averaged decision tree: relevant values of Fast +Fourier Transform are averaged to appear as a single *mean frame* in order not +to unbalance the algorithm. + +.. note:: + It may deal with incomplete data such as no *Rise* or no *Tilt* + information, thanks to preprocessing. (However, the decision might not + be relevant, but this is a user issue.) + +.. 
seealso:: + Standard libraries: + * :py:mod:`sklearn` + * :py:mod:`sklearn.preprocessing` + * :py:mod:`sklearn.tree` + Xylok: + * :py:mod:`lib.database` +""" + import sklearn import sklearn.preprocessing @@ -6,19 +28,35 @@ import sklearn.tree import lib.database -LCURSOR = lib.database.LearningPGCursor() +# Cursor on PostgreSQL database that allows reading/writing data in it +LCURSOR = lib.database.XylokPGCursor() def generate_learner(): """ Returns an averaged DecisionTree + + Parameters: + - None + Uses: + - :py:mod:`sklearn` + - :py:mod:`sklearn.preprocessing` + - :py:mod:`sklearn.tree` + Returns: + - decision tree and a list of all possible labels (str) """ + # Creates an imputer that treats 0.0 as a missing value and fills it in with the mean imp = sklearn.preprocessing.Imputer(missing_values=0.0, strategy="mean", verbose=0, copy=False, axis=1) + + # Retrieves data, list of numbers (translation of labels) and a list of + # all the possible labels (str) from the PostgreSQL database datas, answers, possible_answers = LCURSOR.fetch_averaged_sequence_data() + # The imputer does its job imp.fit(datas) datas = imp.transform(datas) + # Creates the decision tree dtree = sklearn.tree.DecisionTreeClassifier() dtree.fit(datas, answers) diff --git a/populate_db.py b/populate_db.py index cd9f011..5d9b37c 100755 --- a/populate_db.py +++ b/populate_db.py @@ -1,9 +1,13 @@ #!/usr/bin/env python2.7 """ +This program populates a pre-existing PostgreSQL database with the results. +.. warning:: + This program requires a PostgreSQL database in order to work properly! + Please run :py:data:`sql.create_db.sh` with your PAM username as an + argument or create a PostgreSQL database using the database schema stored in + :py:data:`sql.create_db.sql`. -This program populates a pre-existing PostgreSQL database with the results. 
-A shell Authors: Pauline Pommeret, Jonas Senizergues @@ -27,7 +31,7 @@ import lib.trx as trx_lib import lib.database as database if __name__ == "__main__": - DBCURSOR = database.PGCursor() + DBCURSOR = database.XylokPGCursor() # -*- Parsing command-line arguments using argparse -*- # diff --git a/sequence.py b/sequence.py index f55c257..81a5741 100644 --- a/sequence.py +++ b/sequence.py @@ -104,7 +104,17 @@ class Sequence(object): def do_analysis(self): """ - Compute complex data from parameters. + Runs the loading functions that do the actual job. + *(Those functions are called from a dedicated method so that they are + called only if the data isn't already in the database)* + + Parameters: + - ``self`` + Uses: + - :py:meth:`load_md` + - :py:meth:`load_trx` + Returns: + - nothing """ self.load_md() self.load_trx() @@ -117,10 +127,8 @@ class Sequence(object): - ``self`` - ``name`` - ``default`` - Returns: - a string - May raise: - AttributeError """ @@ -138,10 +146,8 @@ class Sequence(object): Parameters: - ``self`` - ``name`` - Returns: - self["a"] - May raise: - KeyError """ @@ -161,10 +167,8 @@ class Sequence(object): Parameters: - ``self`` - ``fasta`` : str, path to the considered fasta file - Uses: - :py:meth:`lib.file_tools:load_fasta` - Returns: - nothing """ @@ -183,16 +187,13 @@ class Sequence(object): Parameters: - ``self`` - Uses: - :py:meth:`lib.file_tools:load_md_data` - :py:meth:`lib.fft_tools:fft` - :py:meth:`lib.fft_tools:get_noticeable_data` - :py:meth:`lib.correlation:compute_correlations` - Returns: - nothing - May raise: - NotImplementedError """ @@ -267,14 +268,12 @@ class Sequence(object): Parameters: - ``self`` - Uses: - :py:meth:`lib.trx:match` - :py:meth:`lib.trx:parse_trx_scale` - :py:meth:`lib.fft_tools:fft` - :py:meth:`lib.fft_tools:sliding_fft` - :py:meth:`lib.fft_tools:get_noticeable_data` - Returns: - nothing """ diff --git a/test_sequence.py b/test_sequence.py index 50ea927..085ec27 100755 --- a/test_sequence.py +++ b/test_sequence.py @@ 
-1,5 +1,23 @@ #!/usr/bin/env python2.7 # XXX Bleh +""" +Generates the decision tree associated with the test sequence. + +If the sequence is already in the database: reports its id and exits. +Otherwise: generates the DT (decision tree), adds the sequence data to the +database and prints the label that the DT predicts for the +test sequence. + + + +.. warning:: + Requires a populated PostgreSQL database in order to run properly. + +.. seealso:: modules :py:mod:`lib.learning` :py:mod:`lib.database` +:py:mod:`lib.trx` :py:mod:`sequence` + +""" + import argparse from argparse import RawDescriptionHelpFormatter @@ -13,9 +31,7 @@ import sequence if __name__ == "__main__": - DT, PA = lib.learning.generate_learner() - LC = lib.database.LearningPGCursor() - DBCURSOR = lib.database.PGCursor() + DBCURSOR = lib.database.XylokPGCursor() # -*- Parsing command-line arguments using argparse -*- # @@ -91,11 +107,8 @@ A .dat file must me organized this way: seq = sequence.Sequence(fasta_file, md_parameters, alphabet=ARGS.alphabet, trx_scale_path=ARGS.trx_scale_file, sliding=ARGS.sliding, centering=ARGS.centering, alpha=ARGS.alpha, graph=ARGS.graph) seq_id = DBCURSOR.check_sequence(seq) if seq_id: - if raw_input("This sequence is already in database. Would you like to recompute its data? [y/n]") != "y": - data = LC.fetch_averaged_specific_sequence(seq_id) - print PA[DT.predict(data)[0]] - sys.exit(0) - DBCURSOR.remove_seq(seq_id) + print "This sequence is already in database (id=%s)." % (seq_id,) + sys.exit(0) # We have to build the DT only if we are certain that the sequence is # not in the db, as it could alter the result of the DT. @@ -111,5 +124,5 @@ A .dat file must me organized this way: # Done, moving on to next directory print "Data stored in %r added to database under id %s" % (cur_dir, seq_id) - data = LC.fetch_averaged_specific_sequence(seq_id) + data = DBCURSOR.fetch_averaged_specific_sequence(seq_id) print PA[DT.predict(data)[0]] -- GitLab