#!/usr/bin/env python

import sys
import argparse
import threading
import os
from subprocess import call
from itertools import izip
from sshmm.structure_prediction import calculate_rna_shapes_from_file, calculate_rna_structures_from_file

#Class defining a thread to run secondary structure prediction
class FoldThread(threading.Thread):
     def __init__(self, script, fasta, profile):
         super(FoldThread, self).__init__()
         self.daemon = True
         self.script = script
         self.fasta = open(fasta, 'r')
         self.profile = open(profile, 'w')

     def run(self):
         call([self.script, '-W', '240', '-L', '160', '-u', '1'], stdin=self.fasta, stdout=self.profile)

def parseArguments(args):
    """Sets up the command-line parser and calls it on the command-line arguments to the program.

    arguments:
    args -- command-line arguments to the program"""
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description="""Prepare a CLIP-Seq dataset in BED format for the training of an ssHMM. The following preprocessing steps are taken:
1 - Filter (positive) BED file, 2 - Shuffle (positive) BED file to generate negative dataset, 3 - Enlongate positive and negative BED files for later structure prediction, 4 - Fetch genomic sequences for elongated BED files, 5 - Produce FASTA files with genomic sequences in viewpoint format, 6 - Calculate RNA shapes, 7 - Calculate RNA structures

A root directory for the datasets and a dataset name (e.g., the protein name) has to be given. The following files will be created in the root directory and its subdirectories:
- /bed/<dataset_name>/positive_raw.bed - positive BED file from CLIP-Seq experiment
- /bed/<dataset_name>/positive.bed - filtered positive BED file
- /bed/<dataset_name>/negative.bed - filtered negative BED file
- /bed/<dataset_name>/positive_long.bed - elongated positive BED file
- /bed/<dataset_name>/negative_long.bed - elongated negative BED file
- /temp/<dataset_name>/positive_long.fasta - genomic sequences of elongated positive BED file
- /temp/<dataset_name>/negative_long.fasta - genomic sequences of elongated negative BED file
- /fasta/<dataset_name>/positive.fasta - positive genomic sequences in viewpoint format
- /fasta/<dataset_name>/negative.fasta - negative genomic sequences in viewpoint format
- /shapes/<dataset_name>/positive.txt - secondary structures of positive genomic sequence (predicted by RNAshapes)
- /shapes/<dataset_name>/negative.txt - secondary structures of negative genomic sequence (predicted by RNAshapes)
- /structures/<dataset_name>/positive.txt - secondary structures of positive genomic sequence (predicted by RNAstructures)
- /structures/<dataset_name>/negative.txt - secondary structures of negative genomic sequence (predicted by RNAstructures)

The preprocessing step to begin with can be chosen. For each step, the files generated by the previous step need to be present. To execute all steps, only the positive_raw.bed must be present.

For the filtering step, the minimum score and binding site lengths can be defined with parameters. To fetch genomic sequences in step 4, the following file must be present in the genomes/ subdirectory:

- /genomes/[version]/[version].genome -> BED file defining the size of the chromosomes
- /genomes/[version]/UCSCGenesTrack.bed -> BED file defining the gene intervals
- /genomes/[version]/[version].fa -> FASTA file containing the human genome

The version of the genome can be given as an optional parameter. It defaults to 'hg19'. The files for the genomes/ directory can be obtained from UCSC:

/genomes/[version]/[version].genome:
-> download from http://hgdownload.soe.ucsc.edu/downloads.html#human (Full data set), e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes

/genomes/[version]/UCSCGenesTrack.bed:
-> download 'GENCODE' track in table browser (http://genome.ucsc.edu/cgi-bin/hgTables?hgsid=448474771_vbAiqf22i9Krk5unDxjcdLiigBW1); choose 'knownGene' and 'BED' as output format

/genomes/[version]/[version].fa:
-> download chromosomes from http://hgdownload.soe.ucsc.edu/downloads.html; e.g. wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/chromosomes/*'; concatenate chromosomes with cat and print into .fa file (e.g. with 'zcat chr* > hg19.fa')
""")
    parser.add_argument('directory', type=str, help='root directory for data')
    parser.add_argument('dataset_name', type=str, help='dataset name')
    parser.add_argument('jump_to', type=int, default=1, help='preprocessing step to jump to (as integer): '
                                                             '1 - filter bed, 2 - shuffle bed, '
                                                             '3 - enlongate bed, 4 - fetch sequences, 5 - format FASTA, '
                                                             '6 - calculate RNA shapes, 7 - calculate RNA structures')
    parser.add_argument('min_score', type=float, default=0.0, help='minimum score for binding site (default: 0.0)')
    parser.add_argument('--genome', type=str, default='hg19', help='genome version to use (default: hg19)')
    parser.add_argument('--min_length', type=int, default=8, help='minimum binding site length (default: 8)')
    parser.add_argument('--max_length', type=int, default=75, help='maximum binding site length (default: 75)')
    return parser.parse_args()

def prepareFolderStructure(directory, proteinName):
    if not os.path.exists(directory):
        return False
    if not os.path.exists(directory + '/bed/' + proteinName):
        os.makedirs(directory + '/bed/' + proteinName)
    if not os.path.exists(directory + '/fasta/' + proteinName):
        os.makedirs(directory + '/fasta/' + proteinName)
    if not os.path.exists(directory + '/temp/' + proteinName):
        os.makedirs(directory + '/temp/' + proteinName)
    if not os.path.exists(directory + '/shapes/' + proteinName):
        os.makedirs(directory + '/shapes/' + proteinName)
    if not os.path.exists(directory + '/structures/' + proteinName):
        os.makedirs(directory + '/structures/' + proteinName)
    return True

def filteringBed(filteredBedFileName, rawBedFileName, lowest_score, min_length, max_length):
    print>>sys.stderr, '-->Filtering', rawBedFileName
    filteredBedFile = open(filteredBedFileName, 'w')
    #filter for max length, min length, and min score
    call(['awk', '{{if ($3-$2 <= {0} && $3-$2 >= {1} && $5 >= {2}) {{print $0}}}}'.format(max_length, min_length, lowest_score), rawBedFileName], stdout=filteredBedFile)
    filteredBedFile.close()
    print>>sys.stderr, 'Written', filteredBedFileName

def shuffleBed(bedPositiveFileName, bedHg19FileName, bedGenesFileName, bedNegativeFileName):
    print>>sys.stderr, '-->Shuffeling', bedPositiveFileName    
    bedNegativeFile = open(bedNegativeFileName, 'w')
    call(['bedtools', 'shuffle', '-i', bedPositiveFileName, '-g', bedHg19FileName, '-incl', bedGenesFileName], stdout=bedNegativeFile)
    bedNegativeFile.close()
    print>>sys.stderr, 'Written', bedNegativeFileName

def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName):
    bedIntervalLongFile = open(bedIntervalLongFileName, 'w')
    print>>sys.stderr, '-->Elongating', bedIntervalFileName
    call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', '20'], stdout=bedIntervalLongFile)
    bedIntervalLongFile.close()
    print>>sys.stderr, 'Written', bedIntervalLongFileName

def fetchingSequences(fastaHg19FileName, fastaTempFileName, bedIntervalLongFileName):
    print>>sys.stderr, '-->Fetching nucleotide sequence for', bedIntervalLongFileName
    call(['fastaFromBed', '-s', '-fi', fastaHg19FileName, '-bed', bedIntervalLongFileName, '-fo', fastaTempFileName])
    print>>sys.stderr, 'Written', fastaTempFileName

def formatFasta(fastaFormattedFileName, bedIntervalFileName, bedIntervalLongFileName, fastaTempFileName):
    fastaTempFile = open(fastaTempFileName, 'r')
    fastaFormattedFile = open(fastaFormattedFileName, 'w')
    print>>sys.stderr, '-->Format FASTA file', fastaTempFileName, 'into viewpoint format'
    with open(bedIntervalFileName, 'r') as bedIntervalFile, open(bedIntervalLongFileName, 'r') as bedIntervalLongFile: 
        for x, y in izip(bedIntervalFile, bedIntervalLongFile):
            intervalFields = x.strip().split('\t')
            intervalLongFields = y.strip().split('\t')
            before = int(intervalFields[1]) - int(intervalLongFields[1])
            viewpoint = int(intervalFields[2]) - int(intervalFields[1])
            after = int(intervalLongFields[2]) - int(intervalFields[2])

            header = fastaTempFile.readline().strip()
            sequence = fastaTempFile.readline().strip()
            if len(sequence) == before + viewpoint + after:
                formattedSequence = sequence[:before].lower() + sequence[before:before + viewpoint].upper() + sequence[before + viewpoint:].lower()
                print>>fastaFormattedFile, header
                print>>fastaFormattedFile, formattedSequence
            else:
                print>>sys.stderr, 'Error in formatFasta(): sequence length not identical with before + viewpoint + after.', before, viewpoint, after, len(sequence)
    fastaFormattedFile.close()
    fastaTempFile.close()
    print>>sys.stderr, 'Written', fastaFormattedFileName


def main(args):
    options = parseArguments(args)

    #prepare folder structure
    if not(prepareFolderStructure(options.directory, options.dataset_name)):
        print>>sys.stderr, 'ERROR: Given directory not found.'

    #filtering bed
    bedFileName = options.directory + '/bed/' + options.dataset_name + '/positive_raw.bed'
    bedPositiveFileName = options.directory + '/bed/' + options.dataset_name + '/positive.bed'
    if options.jump_to <= 1:
        filteringBed(bedPositiveFileName, bedFileName, options.min_score, options.min_length, options.max_length)

    #Genome paths
    bedHgFileName = '{0}/genomes/{1}/{1}.genome'.format(options.directory, options.genome)
    bedGenesFileName = '{0}/genomes/{1}/UCSCGenesTrack.bed'.format(options.directory, options.genome)
    fastaHgFileName = '{0}/genomes/{1}/{1}.fa'.format(options.directory, options.genome)

    #Shuffle positive bed to get negative bed
    bedNegativeFileName = options.directory + '/bed/' + options.dataset_name + '/negative.bed'
    if options.jump_to <= 2:
        shuffleBed(bedPositiveFileName, bedHgFileName, bedGenesFileName, bedNegativeFileName)

    #Elongate positive bed
    bedPositiveLongFileName = options.directory + '/bed/' + options.dataset_name + '/positive_long.bed'
    if options.jump_to <= 3:
        elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName)

    #Elongate negative bed
    bedNegativeLongFileName = options.directory + '/bed/' + options.dataset_name + '/negative_long.bed'
    if options.jump_to <= 3:
        elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName)

    #Fetch positive sequences
    fastaTempPositiveFileName = options.directory + '/temp/' + options.dataset_name + '/positive_long.fasta'
    if options.jump_to <= 4:
        fetchingSequences(fastaHgFileName, fastaTempPositiveFileName, bedPositiveLongFileName)

    #Fetch negative sequences
    fastaTempNegativeFileName = options.directory + '/temp/' + options.dataset_name + '/negative_long.fasta'
    if options.jump_to <= 4:
        fetchingSequences(fastaHgFileName, fastaTempNegativeFileName, bedNegativeLongFileName)

    #Format positive FASTA
    fastaPositiveFileName = options.directory + '/fasta/' + options.dataset_name + '/positive.fasta'
    if options.jump_to <= 5:
        formatFasta(fastaPositiveFileName, bedPositiveFileName, bedPositiveLongFileName, fastaTempPositiveFileName)

    #Format negative FASTA
    fastaNegativeFileName = options.directory + '/fasta/' + options.dataset_name + '/negative.fasta'
    if options.jump_to <= 5:
        formatFasta(fastaNegativeFileName, bedNegativeFileName, bedNegativeLongFileName, fastaTempNegativeFileName)

    #Calculate positive RNA shapes
    shapePositiveFileName = options.directory + '/shapes/' + options.dataset_name + '/positive.txt'
    if options.jump_to <= 6:
        calculate_rna_shapes_from_file(shapePositiveFileName, fastaPositiveFileName)

    #Calculate negative RNA shapes
    shapeNegativeFileName = options.directory + '/shapes/' + options.dataset_name + '/negative.txt'
    if options.jump_to <= 6:
        calculate_rna_shapes_from_file(shapeNegativeFileName, fastaNegativeFileName)

    #Calculate positive RNA structures
    structuresPositiveFileName = options.directory + '/structures/' + options.dataset_name + '/positive.txt'
    if options.jump_to <= 7:
        calculate_rna_structures_from_file(structuresPositiveFileName, fastaPositiveFileName)

    #Calculate negative RNA structures
    structuresNegativeFileName = options.directory + '/structures/' + options.dataset_name + '/negative.txt'
    if options.jump_to <= 7:
        calculate_rna_structures_from_file(structuresNegativeFileName, fastaNegativeFileName)

if __name__ == "__main__" :
    sys.exit(main(sys.argv))
