Commit 7dff8e46 authored by Pauline Pommeret

Program that returns the query's learning label

parent 106514c2
#!/usr/bin/env python2.7
# XXX Bleh
import argparse
from argparse import RawDescriptionHelpFormatter
import os
import sys
import lib.learning
import lib.database
import lib.trx as trx_lib
import sequence
class NoFastaProvided(Exception):
    """Raised when the data directory holds no ``sequence.fasta`` file."""


# TEXT is a string that holds the description of the program
TEXT = """This program creates a postgresql database that holds the following data:
- sequence information (name, accession number, sequence alphabet, description, etc)
- analysis parameters (Student table's alpha, number of bp in center, smooth window, etc)
- relevant information on the TRX signal FFT
- relevant information on the MD helicoidal parameters FFT
- Spearman and Pearson correlations between every helicoidal parameters for every frame of the MD
The data directory must be organized this way:
datadir
\\-dir1
\\-group (mandatory)
\\-sequence.fasta (mandatory)
\\-rise.dat
\\-roll.dat
\\-tilt.dat
\\-twist.dat
\\-shift.dat
\\-slide.dat
\\-...
A .dat file must me organized this way:
frame C16/C17 C17/G18 G18/A19 ...
0 -1.0 9.0 -1.8
1 -4.4 -7.6 -0.1
2 -6.4 1.0 -6.0
3 -3.6 -1.1 -2.5
"""


def build_parser():
    """Create the command-line parser of the program.

    Returns:
        An ``argparse.ArgumentParser`` exposing the analysis options
        (alphabet, alpha, centering, sliding, trx scale file, graph) and the
        mandatory ``datadir`` positional argument.
    """
    parser = argparse.ArgumentParser(
        description=TEXT,
        formatter_class=RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-a", "--alphabet", type=str, default="dna", action="store",
        help="[str] sequences alphabet (dna, rna, prot), currently only dna is implemented (default: 'dna')",
    )
    parser.add_argument(
        "-A", "--alpha", type=float, default=0.05, action="store",
        help="[float] alpha parameter of the Student table that is to be used in the statistical analysis (default: 0.05)",
    )
    parser.add_argument(
        "-c", "--centering", type=int, default=72, action="store",
        help="[int] number of bp that are to be considered as the center of the sequence (default: 72)",
    )
    parser.add_argument(
        "-s", "--sliding", type=int, default=72, action="store",
        help="[int] number of bp that are to be included in the smoothing window (default: 72)",
    )
    parser.add_argument(
        "-t", "--trx-scale-file", type=str, default=trx_lib.SCALE_FILE, action="store",
        help="[str] path to trx scale file (default: trx_lib.SCALE_FILE)",
    )
    parser.add_argument(
        "-g", "--graph", type=int, default=None, action="store",
        help="[int] number of graphs that are to be plotted, currently not implemented (default: None)",
    )
    parser.add_argument(
        "datadir", type=str, action="store",
        help="[str] path to the data directory",
    )
    return parser


def find_input_files(datadir):
    """Locate the fasta file and the ``.dat`` files of a data directory.

    Only the direct entries of ``datadir`` are inspected.

    Args:
        datadir: path to the directory to scan.

    Returns:
        A ``(fasta_file, md_parameters)`` tuple. ``fasta_file`` is the path to
        ``sequence.fasta`` or ``None`` when the directory has no such entry.
        ``md_parameters`` maps each ``.dat`` file name, stripped of its
        extension, to the path of that file.
    """
    extension = ".dat"
    # Must start as None: the caller tests it to detect a missing fasta file.
    fasta_file = None
    md_parameters = {}
    for filename in os.listdir(datadir):
        if filename == "sequence.fasta":
            fasta_file = os.path.join(datadir, filename)
        elif filename.endswith(extension):
            # Suffix test (not substring) so that e.g. "rise.dat.bak" is
            # not mistaken for a parameter file.
            md_parameters[filename[:-len(extension)]] = os.path.join(datadir, filename)
    return fasta_file, md_parameters


def predict_label(learning_cursor, seq_id):
    """Return the learning label predicted for a stored sequence.

    The learner is generated here, at prediction time, so that it is built
    only once per run and reflects the database content at that moment.

    Args:
        learning_cursor: object providing ``fetch_averaged_specific_sequence``.
        seq_id: database id of the sequence to label.

    Returns:
        The entry of the second value returned by
        ``lib.learning.generate_learner()`` indexed by the first prediction
        of the learner (presumably a human-readable label -- TODO confirm).
    """
    learner, labels = lib.learning.generate_learner()
    data = learning_cursor.fetch_averaged_specific_sequence(seq_id)
    return labels[learner.predict(data)[0]]


def main():
    """Analyse the sequence of a data directory and print its learning label.

    Raises:
        EnvironmentError: if the given data directory is an empty string.
        NoFastaProvided: if the data directory has no ``sequence.fasta``.
    """
    # -*- Parsing command-line arguments using argparse -*- #
    # Parsing comes first so that "--help" or a bad command line never
    # triggers the learner generation or a database connection.
    args = build_parser().parse_args()

    # argparse already enforces the presence of datadir; this only rejects
    # an explicitly empty path.
    if not args.datadir:
        raise EnvironmentError("You have to give a data directory.")
    cur_dir = os.path.basename(os.path.normpath(args.datadir))

    # -*- Populating the database -*- #
    fasta_file, md_parameters = find_input_files(args.datadir)
    # Fasta file is mandatory (a sequence is required)
    if fasta_file is None:
        raise NoFastaProvided("There is no fasta file in %r" % (cur_dir,))

    learning_cursor = lib.database.LearningPGCursor()
    db_cursor = lib.database.PGCursor()

    # Tells the users that something is happening (some users do like that)
    print("Processing %r" % (cur_dir,))

    # Creates a Sequence object
    seq = sequence.Sequence(
        fasta_file,
        md_parameters,
        alphabet=args.alphabet,
        trx_scale_path=args.trx_scale_file,
        sliding=args.sliding,
        centering=args.centering,
        alpha=args.alpha,
        graph=args.graph,
    )

    seq_id = db_cursor.check_sequence(seq)
    if seq_id:
        if raw_input("This sequence is already in database. Would you like to recompute its data? [y/n]") != "y":
            # Keep the stored data and label the sequence as it is.
            print(predict_label(learning_cursor, seq_id))
            sys.exit(0)
        db_cursor.remove_seq(seq_id)

    # We have to build the DT only if we are certain that the sequence is
    # not in the db, as it could alter the result of the DT. It is therefore
    # generated here, after the removal and before the sequence is stored.
    learner, labels = lib.learning.generate_learner()

    seq.do_analysis()

    # Tells the users that something is happening (some users do like that)
    print("Adding data stored in %r in the database" % (cur_dir,))
    # Stores the sequence "seq" in the database
    seq_id = db_cursor.store_sequence(seq)
    print("Data stored in %r added to database under id %s" % (cur_dir, seq_id))

    data = learning_cursor.fetch_averaged_specific_sequence(seq_id)
    print(labels[learner.predict(data)[0]])


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment