#!/usr/bin/env python

# david - perform DAVID enrichments
#
# This program is part of AGEpy
#
# Author: Sven E. Templer <sven.templer@gmail.com>
#
# Copyright (c) 2016 - Bioinformatics Core Facility at the
#                      Max Planck Institute for Biology of Ageing,
#                      Cologne, Germany

import argparse

# Command line interface. ArgumentDefaultsHelpFormatter appends the default
# value of every option to its help text.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# positional: one or more input tables
parser.add_argument(
    "table",
    nargs='+',
    help="Input file[s], format is text or xls[x], same for each file and auto-detected.",
)

# DAVID query settings
parser.add_argument(
    "-t", "--DAVIDtypes",
    metavar="LIST",
    default='GOTERM_BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY,BIOCARTA,PFAM,PROSITE',
    help="DAVID category types to enrich for. Comma separated string.",
)
parser.add_argument(
    "-i", "--DAVIDid",
    metavar="ID",
    default='ENSEMBL_GENE_ID',
    help="DAVID data format id.",
)
parser.add_argument(
    "-u", "--DAVIDuser",
    metavar="USER",
    default=None,
    help="DAVID user (email).",
)
parser.add_argument(
    "-p", "--max-pvalue",
    metavar="P",
    type=float,
    default=0.05,
    help="Maximum p-value for enrichment",
)
parser.add_argument(
    "-n", "--min-genes",
    metavar="N",
    type=int,
    default=2,
    help="Minimum number of genes in term",
)

# how to read the input tables
parser.add_argument(
    "-d", "--delimiter",
    default='\t',
    help="For text input files, the column separator",
)
# Disabled options, kept for reference:
#parser.add_argument("-b", "--background_table", help="Optional background table.Excel file with one sheet. Each column has the name of the set on the column header and the gene names for the genes as in the respective GTF file ", default='NONE')
#parser.add_argument("-e", "--gene_expression", help="Optional gene expression table.", default='NONE')
parser.add_argument(
    "-c", "--column",
    metavar='N',
    type=int,
    default=None,
    help="Select a column index (0-based) with ids to query. None means all.",
)
# Disabled multi-column variant of -c, kept for reference:
#parser.add_argument("-c", "--column", metavar = 'N', help = "Select a column index (0-based) with ids to query. None means all.", default = None, type = int, nargs='+')
parser.add_argument(
    "-s", "--sheet",
    nargs='+',
    default=None,
    help="If input is an xls[x] file, select a sheet. None means all.",
)
parser.add_argument(
    "-g", "--gtf-table",
    default=None,
    help="A gtf file to convert the ids in the input tables from gene_name to gene_id (as in ENSEMBL gtf files).",
)

# output and logging
parser.add_argument(
    "-o", "--output_folder",
    default='david_output',
    help="Output folder",
)
parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="Be verbose",
)

# Parse sys.argv; argparse exits with a usage message on invalid input.
args = parser.parse_args()


import time
import codecs
from datetime import datetime
import pandas
import os
import sys
import AGEpy.AGEpy as AGEpy


# Validate the mandatory DAVID user first, so that an aborted run does not
# leave an empty output folder behind.
# (Single-argument print(...) behaves the same on Python 2 and Python 3.)
if args.DAVIDuser is None:
  print("error: missing david user id")
  sys.exit(1)

# Sheets requested on the command line; a single empty name is the
# placeholder for "no sheet selected".
sheets = args.sheet
if sheets is None:
  sheets = [""]

# Create the output folder. Trying first and checking afterwards avoids the
# race between an existence test and makedirs, and fails early when the
# path exists but is not a directory.
try:
  os.makedirs(args.output_folder)
except OSError:
  if not os.path.isdir(args.output_folder):
    raise

# Optional id translation table. When a gtf file is given, `gtf` becomes a
# two-column frame ('gene_name', 'gene_id') without duplicates or missing
# values; otherwise it stays None.
gtf = None
if args.gtf_table is not None:
  if args.verbose:
    print("Reading gtf '" + args.gtf_table + "'")
  gtf_raw = AGEpy.readGTF(args.gtf_table)
  if args.verbose:
    print("* fetching attributes")
  gtf_names = AGEpy.retrieve_GTF_field('gene_name', gtf_raw)
  gtf_ids = AGEpy.retrieve_GTF_field('gene_id', gtf_raw)
  # Temporary column label; presumably keeps the two frames from sharing a
  # column name while they are combined -- confirm against AGEpy.
  gtf_names.columns = ['2']
  name_id_pairs = pandas.concat([gtf_names, gtf_ids], axis = 1)
  gtf = name_id_pairs.drop_duplicates().dropna()
  gtf.columns = ['gene_name','gene_id']
  if args.verbose:
    print(gtf.head())

# Main loop: for every input file, sheet and id column, query DAVID and write
# the result to '<output_folder>/<file>_<sheet>_<column>.xlsx' with one
# worksheet per DAVID category.
# (Single-argument print(...) behaves the same on Python 2 and Python 3.)
for f in args.table:
  if args.verbose:
    print("Reading table '" + f + "'")
  f_name = os.path.splitext(os.path.basename(f))[0]
  f_fmt = AGEpy.getFileFormat(f)
  # Resolve the sheets per file, so that sheet names discovered in one
  # workbook are never reused for the files processed after it.
  if args.sheet is not None:
    f_sheets = args.sheet
  elif f_fmt in ["xls", "xlsx"]:
    f_sheets = pandas.ExcelFile(f).sheet_names
  else:
    f_sheets = [""]  # placeholder: no sheet selected
  for s in f_sheets:
    s_name = s
    # Compare by value; identity of equal strings is an implementation detail.
    if s == "":
      s = None
    if args.verbose and s is not None:
      print("* getting sheet '" + s_name + "'")
    d = AGEpy.readDataFrame(f, sheet = s, sep = args.delimiter)
    cols = d.columns.values.tolist()
    if args.column is not None:
      cols = [cols[args.column]]
    if args.verbose:
      # Headers are not necessarily strings (e.g. numeric Excel headers).
      print("* using columns: " + ', '.join(map(str, cols)))
    for c in cols:
      dc = d[[c]]
      dc_col = 'gene_name'
      dc.columns = [dc_col]
      c_name = str(c) # take first row/header?
      if gtf is not None:
        if args.verbose:
          print("* translating ids")
        # Left join on gene_name, then drop the rows without a gene_id.
        dc = pandas.merge(dc, gtf, left_on = [dc_col], right_on = ['gene_name'], how = 'left').dropna()
        dc_col = 'gene_id'
        dc = dc[[dc_col]].drop_duplicates()
      if args.verbose:
        dc_n = str(len(dc))
        dc_head = map(str, dc[dc_col].head().tolist())
        print("* querying for colum '" + c_name + "' n = " + dc_n)
        print(', '.join(dc_head) + ', ...')
        sys.stdout.flush()
      dc_nmissing = dc[dc_col].isnull().sum()
      print("* missing values: " + str(dc_nmissing))
      dc = dc[dc_col].dropna().tolist()
      if not dc:
        # Nothing left to send; do not call the service with an empty list.
        print("* no ids to query, no output")
        continue
      dc_david = AGEpy.DAVIDenrich(args.DAVIDid, args.DAVIDtypes, args.DAVIDuser,
          dc, p = args.max_pvalue, n = args.min_genes, verbose = args.verbose)
      if dc_david is None:
        print("* no results, no output")
        continue
      if gtf is not None:
        dc_david = AGEpy.id_nameDAVID(dc_david, name_id = gtf)
      out_name = os.path.join(args.output_folder,
          f_name + '_' + s_name + '_' + c_name + ".xlsx")
      print("* writing " + out_name)
      # The context manager closes (and thereby saves) the workbook even when
      # writing a sheet fails. unique() keeps first-appearance order, so the
      # worksheet order is deterministic.
      with pandas.ExcelWriter(out_name) as writer:
        for t in dc_david['categoryName'].unique():
          dc_david_t = dc_david[dc_david['categoryName'] == t]
          dc_david_t.to_excel(writer, sheet_name = t, index = False)

print("Done")

