#!/srv/sw/python/2.7.4/bin/python

###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################

__author__ = "Donovan Parks"
__copyright__ = "Copyright 2014"
__credits__ = ["Donovan Parks"]
__license__ = "GPL3"
__maintainer__ = "Donovan Parks"
__email__ = "donovan.parks@gmail.com"
__status__ = "Development"

import os
import sys
import tempfile
import argparse

from comparem.main import OptionsParser

from biolib.misc.custom_help_formatter import ChangeTempAction, CustomHelpFormatter

"""
*****************************************************************************
To do:
 - average nucleotide identity
 -- need to explore different methods for calculating this
 -- simplest would seem to be fragmenting a genome into fix sized
    windows and using blastn, but I know more recent papers have
    argued against this approach and have preferred using mummer

- many of the classes should migrate over to use the Parallel class
-- e.g., blast.py
*****************************************************************************
"""


def version():
    binDir = os.path.dirname(os.path.realpath(__file__))
    versionFile = open(os.path.join(binDir, '..', 'comparem', 'VERSION'))
    return versionFile.read().strip()


def print_help():
    print ''
    print '                ...::: CompareM v' + version() + ' :::...'''
    print '''\

    ani           -> [Not implemented] Calculate the ANI between genome pairs

    aai_wf -> Run call_genes, blast, and aai
     call_genes   -> Identify genes within genomes
     rblast       -> Perform reciprocal blast between genome pairs
     aai          -> Calculate the AAI between homologs in genome pairs

    gene_wf -> Run core, dispensable, and unique
     core         -> [Not implemented] Identify genes contained in all genomes
     dispensable  -> [Not implemented] Identify genes contained in more than one, but not all genomes
     unique       -> [Not implemented] Identify genes present in a single genome

    General statistics:
     genome_stats -> [Not implemented] Calculate statistics for each genome (e.g., GC, coding density)

    Usage patterns:
     aa_usage     -> Calculate amino acid usage within each genome
     codon_usage  -> Calculate codon usage within each genome
     kmer_usage   -> Calculate kmer usage within each genome
     stop_usage   -> Calculate stop codon usage within each genome

    Lateral gene transfer:
     lgt_di       -> Calculate dinuceotide (3rd,1st) usage of genes to identify putative LGT events
     lgt_codon    -> Calculate codon usage of genes to identify putative LGT events

    Plots:
     pcoa_plot    -> [Not implemented] Generate PCoA plot indicating relative similarity of genomes
     heatmap      -> Generate a heatmap of the similarities between genomes

  Use: comparem <command> -h for command specific help.

  Feature requests or bug reports can be sent to Donovan Parks (donovan.parks@gmail.com)
    or posted on GitHub (https://github.com/dparks1134/comparem).
    '''

if __name__ == '__main__':

    # initialize the options parser
    parser = argparse.ArgumentParser(add_help=False)
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    # identify genes within genomes
    call_genes_parser = subparsers.add_parser('call_genes',
                                        formatter_class=CustomHelpFormatter,
                                        description='Identify genes within genomes.')
    call_genes_parser.add_argument('genome_dir', help="directory containing genomes")
    call_genes_parser.add_argument('output_dir', help="output directory")
    call_genes_parser.add_argument('--genome_ext', default='fna', help="extension of genomes (other files in directory are ignored)")
    call_genes_parser.add_argument('--force_table', type=int, default=None, help="force use of specific translation table")
    call_genes_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # run reciprocal blast between genome pairs
    rblast_parser = subparsers.add_parser('rblast',
                                        formatter_class=CustomHelpFormatter,
                                        description='Perform reciprocal blast between genome pairs.')
    rblast_parser.add_argument('protein_dir', help="directory containing amino acid genes in fasta format")
    rblast_parser.add_argument('output_dir', help="output directory")
    rblast_parser.add_argument('-e', '--evalue', type=float, default=1e-3, help="e-value for reporting valid blast hits")
    rblast_parser.add_argument('-p', '--per_identity', type=float, default=30.0, help="percent identity for reporting valid blast hits")
    rblast_parser.add_argument('-x', '--protein_ext', default='faa', help="extension of amino acid gene files (other files in directory are ignored)")
    rblast_parser.add_argument('--blastp', action="store_true", default=False, help="use blastp instead of diamond")
    rblast_parser.add_argument('--tmpdir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
    rblast_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # calculate AAI between homologs genes
    aai_parser = subparsers.add_parser('aai',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate the AAI between homologs in genome pairs.')
    aai_parser.add_argument('rblast_dir', help="directory containing results of reciprocal blast between genome pairs")
    aai_parser.add_argument('output_dir', help="output directory")
    aai_parser.add_argument('-p', '--per_identity', type=float, default=30.0, help="percent identity for defining homology")
    aai_parser.add_argument('-a', '--per_aln_len', type=float, default=70.0, help="percent alignment length of query sequence for defining homology")
    aai_parser.add_argument('-x', '--protein_ext', default='faa', help="extension of amino acid gene files (other files in directory are ignored)")
    aai_parser.add_argument('--write_shared_genes', help="write homologous genes in each genome pair to file", action='store_true')
    aai_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # runs call_genes, blast, and calculate
    aai_wf_parser = subparsers.add_parser('aai_wf',
                                        formatter_class=CustomHelpFormatter,
                                        description='Runs call_genes, blast, and calculate.')
    aai_wf_parser.add_argument('genome_dir', help="directory containing genomes")
    aai_wf_parser.add_argument('output_dir', help="output directory")
    aai_wf_parser.add_argument('--proteins', action="store_true", default=False, help="files in genome directory contain genes as amino acids")
    aai_wf_parser.add_argument('-e', '--evalue', type=float, default=1e-3, help="e-value cutoff for identifying initial blast hits")
    aai_wf_parser.add_argument('-p', '--per_identity', type=float, default=30.0, help="percent identity for defining homology")
    aai_wf_parser.add_argument('-a', '--per_aln_len', type=float, default=70.0, help="percent alignment length of query sequence for defining homology")
    aai_wf_parser.add_argument('--blastp', action="store_true", default=False, help="use blastp instead of diamond")
    aai_wf_parser.add_argument('-x', '--genome_ext', default='fna', help="extension of genomes (other files in directory are ignored)")
    aai_wf_parser.add_argument('--write_shared_genes', help="write homologous genes in each genome pair to file", action='store_true')
    aai_wf_parser.add_argument('--force_table', type=int, default=None, help="force use of specific translation table")
    aai_wf_parser.add_argument('--tmpdir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
    aai_wf_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # calculate amino acid usage
    aa_usage_parser = subparsers.add_parser('aa_usage',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate amino acid usage within each genome.')
    aa_usage_parser.add_argument('protein_dir', help="directory containing fasta files with called genes in amino acid space")
    aa_usage_parser.add_argument('output_file', help="file indicating genes amino acid usage for each genome")
    aa_usage_parser.add_argument('-x', '--protein_ext', default='genes.faa', help="extension of amino acid gene files (other files in directory are ignored)")
    aa_usage_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # calculate codon usage
    codon_parser = subparsers.add_parser('codon_usage',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate codon usage within each genome.')
    codon_parser.add_argument('gene_dir', help="directory containing fasta files with called genes in nucleotide space")
    codon_parser.add_argument('output_file', help="output file indicating codon usage of each genome")
    codon_parser.add_argument('--gene_ext', default='fna', help="extension of gene files (other files in directory are ignored)")
    codon_parser.add_argument('--keep_ambiguous', action='store_true', help="keep codons with ambiguous bases")
    codon_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # calculate kmer usage
    kmer_parser = subparsers.add_parser('kmer_usage',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate kmer usage within each genome.')
    kmer_parser.add_argument('genome_dir', help="directory containing genomes")
    kmer_parser.add_argument('output_file', help="output file indicating kmer usage of each genome")
    kmer_parser.add_argument('-k', type=int, default=4, help="length of kmers, e.g., 4 -> tetranucleotides")
    kmer_parser.add_argument('-x', '--genome_ext', default='fna', help="extension of genomes (other files in directory are ignored)")
    kmer_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # calculate stop codon usage
    stop_codon_parser = subparsers.add_parser('stop_usage',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate stop codon usage within each genome.')
    stop_codon_parser.add_argument('gene_dir', help="directory containing fasta files with called genes in nucleotide space")
    stop_codon_parser.add_argument('output_file', help="output file indicating stop codon usage of each genome")
    stop_codon_parser.add_argument('-x', '--gene_ext', default='genes.fna', help="extension of gene files (other files in directory are ignored)")
    stop_codon_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # identify LGT using dinucleotide (3rd, 1st) usage
    lgt_di_parser = subparsers.add_parser('lgt_di',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate dinuceotide (3rd,1st) usage of genes  to identify putative LGT events.')
    lgt_di_parser.add_argument('gene_dir', help="directory containing fasta files with called genes in nucleotide space")
    lgt_di_parser.add_argument('output_dir', help="output directory to write dinucleotide usage for each gene in each genome")
    lgt_di_parser.add_argument('--gene_ext', default='fna', help="extension of gene files (other files in directory are ignored)")
    lgt_di_parser.add_argument('--crit_value', type=float, default=0.001, help="critical value for defining deviant genes")
    lgt_di_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # identify LGT using codon usage
    lgt_codon_parser = subparsers.add_parser('lgt_codon',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate codon usage of genes to identify putative LGT events.')
    lgt_codon_parser.add_argument('gene_dir', help="directory containing fasta files with called genes in nucleotide space")
    lgt_codon_parser.add_argument('output_dir', help="output directory to write dinucleotide usage for each gene in each genome")
    lgt_codon_parser.add_argument('-x', '--gene_ext', default='fna', help="extension of gene files (other files in directory are ignored)")
    lgt_codon_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)

    # unique
    unique_parser = subparsers.add_parser('unique',
                                        formatter_class=CustomHelpFormatter,
                                        description='Identify genes present in a single genome.')
    unique_parser.add_argument('gene_dir', help="output directory specific with 'ortholog'")
    unique_parser.add_argument('output_file', help="file indicating genes identified as unique to a single genome")
    unique_parser.add_argument('-e', '--evalue', type=float, default=1e-3, help="e-value cutoff for identifying initial blast hits")
    unique_parser.add_argument('-p', '--per_identity', type=float, default=30.0, help="percent identity for defining orthology")
    unique_parser.add_argument('-a', '--per_aln_len', type=float, default=70.0, help="percent alignment length of query sequence for defining othology")

    # produce PCoA plot
    pcoa_plot_parser = subparsers.add_parser('pcoa_plot',
                                        formatter_class=CustomHelpFormatter,
                                        description='Generate PCoA plot indicating relative similarity of genomes.')
    pcoa_plot_parser.add_argument('aai_summary_file', help="file indicating pairwise AAI between genomes")
    pcoa_plot_parser.add_argument('output_file', help="output PCA plot")
    pcoa_plot_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    pcoa_plot_parser.add_argument('--height', type=float, default=6.5, help='height of output image')
    pcoa_plot_parser.add_argument('--image_type', default='png', choices=['eps', 'pdf', 'png', 'ps', 'svg'], help='desired image type')
    pcoa_plot_parser.add_argument('--dpi', type=int, default=600, help='desired DPI of output image')
    pcoa_plot_parser.add_argument('--font_size', type=int, default=8, help='Desired font size')

    # produce a heatmap
    heatmap_parser = subparsers.add_parser('heatmap',
                                        formatter_class=CustomHelpFormatter,
                                        description='Generate heatmap indicating relative similarity of genomes.')
    heatmap_parser.add_argument('aai_summary_file', help="File indicating pairwise AAI between genomes")
    heatmap_parser.add_argument('output_file', help="Output name for the heatmap. File extension is used to determine image type (e.g. png, pdf, svg)")
    # heatmap_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    # heatmap_parser.add_argument('--height', type=float, default=6.5, help='height of output image')
    # heatmap_parser.add_argument('--image_type', default='png', choices=['eps', 'pdf', 'png', 'ps', 'svg'], help='desired image type')
    # heatmap_parser.add_argument('--dpi', type=int, default=600, help='desired DPI of output image')
    # heatmap_parser.add_argument('--font_size', type=int, default=8, help='Desired font size')

    # get and check options
    args = None
    if(len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv == '--help'):
        print_help()
        sys.exit(0)
    else:
        args = parser.parse_args()

    # do what we came here to do
    try:
        parser = OptionsParser()
        if(False):
            # import pstats
            # p = pstats.Stats('prof')
            # p.sort_stats('cumulative').print_stats(10)
            # p.sort_stats('time').print_stats(10)
            import cProfile
            cProfile.run('parser.parse_options(args)', 'prof')
        elif False:
            import pdb
            pdb.run(parser.parse_options(args))
        else:
            parser.parse_options(args)
    except SystemExit:
        print "\n  Controlled exit resulting from an unrecoverable error or warning."
    except:
        print "\nUnexpected error:", sys.exc_info()[0]
        raise
