import hashlib
import os
import shutil
import subprocess
import sys

from snakemake.logging import logger
from snakemake.utils import report


def get_conda_env():
    if config.get("environment"):
        yml = config.get("environment")
    else:
        yml = os.path.join(
            os.path.dirname(os.path.abspath(workflow.snakefile)), "environment.yml"
        )
    if not os.path.exists(yml):
        sys.exit("Unable to locate the environmental definitions file; tried %s" % yml)
    return yml


def _make_gen(reader):
    """https://stackoverflow.com/a/27518377"""
    b = reader(1024 * 1024)
    while b:
        yield b

        b = reader(1024 * 1024)


def count_sequences(filename):
    """https://stackoverflow.com/a/27518377"""
    if filename.endswith(".gz"):
        import gzip

        f = gzip.open(filename, "rb")
    else:
        f = open(filename, "rb")
    f_gen = _make_gen(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen) / 4


def get_sample_files(fastq_dir, prefilter_file_size):
    if not fastq_dir:
        logger.error(
            (
                "'fastq_dir' has not been set -- this directory "
                "should contain your input FASTQs"
            )
        )
        sys.exit(1)
    if os.path.isfile(fastq_dir):
        logger.error(
            (
                "'fastq_dir' must be a directory or a file pattern, "
                "e.g. '*.fastq', with single quotes surrounding the pattern."
            )
        )
        sys.exit(1)
    if "*" in fastq_dir:
        from glob import glob

        logger.info("Finding samples matching %s" % fastq_dir)
        raw_fastq_files = glob(fastq_dir)
    else:
        logger.info("Finding samples in %s" % fastq_dir)
        raw_fastq_files = [os.path.join(fastq_dir, i) for i in os.listdir(fastq_dir)]
    samples = dict()
    seen = set()
    omitted = dict()
    for fq_path in raw_fastq_files:
        fname = os.path.basename(fq_path)
        fastq_dir = os.path.dirname(fq_path)
        if not ".fastq" in fname and not ".fq" in fname:
            continue

        sample_id = fname.partition(".fastq")[0]
        if ".fq" in sample_id:
            sample_id = fname.partition(".fq")[0]
        sample_id = sample_id.replace("_R1", "").replace("_r1", "").replace(
            "_R2", ""
        ).replace(
            "_r2", ""
        )
        sample_id = sample_id.replace(".", "_").replace(" ", "_").replace("-", "_")
        # sample ID after rename
        if sample_id in seen:
            # but FASTQ has yet to be added
            # if one sample has a dash and another an underscore, this
            # is a case where we should warn the user that this file
            # is being skipped
            if not fq_path in seen:
                logger.warning(
                    "Duplicate sample %s was found after renaming; skipping..." %
                    sample_id
                )
            continue

        # which read index do we have of the pair?
        if "_R1" in fname or "_r1" in fname:
            idx = "r1"
            # simple replace of right-most read index designator
            if fname.find("_R1") > fname.find("_r1"):
                other_idx = os.path.join(fastq_dir, "_R2".join(fname.rsplit("_R1", 1)))
            else:
                other_idx = os.path.join(fastq_dir, "_r2".join(fname.rsplit("_r1", 1)))
        elif "_R2" in fname or "_r2" in fname:
            idx = "r2"
            if fname.find("_R2") > fname.find("_r2"):
                other_idx = os.path.join(fastq_dir, "_R1".join(fname.rsplit("_R2", 1)))
            else:
                other_idx = os.path.join(fastq_dir, "_r1".join(fname.rsplit("_r2", 1)))
        else:
            logger.error(
                "Unable to determine read index for [%s] as it is missing '_R1' or '_R2' designation. Exiting." %
                fname
            )
            sys.exit(1)
        # naming convention not followed and nothing was renamed
        if other_idx == fq_path:
            logger.error("Unable to locate matching index for %s. Exiting." % fq_path)
            sys.exit(1)
        # not paired-end?
        if not os.path.exists(other_idx):
            logger.error(
                "File [%s] for %s was not found. Exiting." % (other_idx, sample_id)
            )
            sys.exit(1)
        seen.add(fq_path)
        seen.add(other_idx)
        seen.add(sample_id)
        # omissions
        if os.path.getsize(fq_path) < prefilter_file_size or os.path.getsize(
            other_idx
        ) < prefilter_file_size:
            c = count_sequences(fq_path)
            logger.info(
                (
                    "%s is being omitted from analysis due to its "
                    "file size (%d paired sequences)."
                ) %
                (sample_id, c)
            )
            omitted["%s **OMITTED**" % sample_id] = c
            fastq_paths = None
        else:
            if idx == "r1":
                fastq_paths = {"r1": fq_path, "r2": other_idx}
            else:
                fastq_paths = {"r1": other_idx, "r2": fq_path}
        if fastq_paths:
            samples[sample_id] = fastq_paths
    logger.info("\nFound %d samples for processing:\n" % len(samples))
    samples_str = ""
    for k, v in samples.items():
        samples_str += "%s: %s; %s\n" % (k, v["r1"], v["r2"])
    logger.info(samples_str)
    return samples, omitted


def get_cluster_sequences_input(wildcards):
    return "chimera-filtered_denovo.fasta" if config.get(
        "denova_chimera_filter"
    ) else "dereplicated-sequences.fasta"


def get_run_reference_chimera_filter_inputs(wildcards):
    if config.get("reference_chimera_filter") is True:
        db = REFFASTA
    else:
        db = config.get("reference_chimera_filter")
    if config.get("denovo_chimera_filter", True):
        fasta = "chimera-filtered_denovo.fasta"
    elif float(config.get("read_identity_requirement", 0.97)) * 100 < 99:
        fasta = "preclustered.fasta"
    else:
        fasta = "dereplicated-sequences.fasta"
    return {"fasta": fasta, "db": db}


def filter_fastas(fasta1, uclust, fasta2, output):
    """fasta1 is the larger set of reads to be filtered
    by read IDs in uclust and fasta2"""
    accepted_reads = set()
    with open(fasta2) as fh:
        for line in fh:
            if line.startswith(">"):
                name = line.strip().strip(">").partition(";")[0]
                accepted_reads.add(name)
    with open(uclust) as fh:
        for line in fh:
            toks = line.strip().split("\t")
            if toks[9] == "*":
                continue

            if toks[8] == "*":
                continue

            a = toks[8].partition(";")[0]
            b = toks[9].partition(";")[0]
            if b in accepted_reads:
                accepted_reads.add(a)
    with open(fasta1) as fh, open(output, "w") as fo:
        keep = False
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                name = line.strip(">").partition(";")[0]
                if name in accepted_reads:
                    keep = True
                else:
                    keep = False
            if keep:
                print(line, file=fo)


def md5(fname):
    # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
    hash_md5 = hashlib.md5()
    if not os.path.exists(fname):
        return None

    with open(fname, "rb") as f:
        for chunk in iter( lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


PROTOCOL_VERSION = subprocess.check_output("hundo --version", shell=True).decode("utf-8").strip()
if config.get("workflow") == "download":
    SAMPLES, OMITTED = {"placeholder": {"r1": "placeholder", "r2": "placeholder"}}
else:
    SAMPLES, OMITTED = get_sample_files(
        config.get("fastq_dir"), config.get("prefilter_file_size", 100000)
    )
CONDAENV = get_conda_env()
REFERENCES = {
    "greengenes.fasta.nhr": "200f1b4356e59be524525e4ca83cefc1",
    "greengenes.fasta.nin": "f41470ebc95c21f13b54f7b127ebe2ad",
    "greengenes.fasta.nsq": "150b00866a87f44c8b33871be7cc6b98",
    "greengenes.map": "cb49d30e2a00476f8bdaaa3eaec693ef",
    "greengenes.tre": "b3a369edde911bf0aa848a842f736fce",
    "silvamod128.fasta.nhr": "99d9b6817a9c6a0249fbb32a60e725ae",
    "silvamod128.fasta.nin": "25d8d6587123aa7dbaabaa1299e481cc",
    "silvamod128.fasta.nsq": "7e12efc4ab9019c9d79809861922a001",
    "silvamod128.map": "00500c7f215a89a3240c907af4f75f33",
    "silvamod128.tre": "340864d9d32e8561f75ec0b0a9a6c3d1",
    "unite.fasta.nhr": "3ce60ad0d3eee81246c6915e64bf3122",
    "unite.fasta.nin": "962c9364b6cb3da2b9457546838bb6fb",
    "unite.fasta.nsq": "796e7338cdbb3d659951e380fd2b8104",
    "unite.map": "11984b9f45fbac120824de516a133e45",
    "unite.tre": "d50a5f427c22964906f2d6fc1b7cf220",
}
REF = [k for k in REFERENCES.keys() if k.startswith(config.get("reference_database", "silva"))]
BLASTREF = [os.path.join(config.get("database_dir", "."), i) for i in REF]
REFFASTA = [os.path.join(config.get("database_dir", "."), i) for i in REF if ".fasta" in i][0].rpartition(".")[0]
REFMAP = [os.path.join(config.get("database_dir", "."), i) for i in REF if i.endswith(".map")][0]
REFTRE = [os.path.join(config.get("database_dir", "."), i) for i in REF if i.endswith(".tre")][0]
STYLE = """
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.1.1/css/bootstrap.css"/>
    <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css"/>
    <style type="text/css">
    body{font-family:Helvetica,arial,sans-serif;font-size:14px;line-height:1.6;padding-bottom:10px;background-color:#fff;color:#333;margin:0}body>div .section::before{content:"";display:block;height:80px;margin:-80px 0 0}#summary::before{margin:0}.topic-title{font-size:18pt}body>div>.section{margin-left:22%;margin-bottom:3em}div.section{margin-right:20px}#contents>p{display:none}button,li p.first{display:inline-block}#contents{margin-top:80px;padding-left:0;width:20%;background-color:#f1f1f1;height:100%;position:fixed;overflow:auto}#contents ul{list-style-type:none}#contents ul>li{font-size:14pt}#contents ul>li a:hover{color:#151d26}button,h1.title{color:#fff;background-color:#151d26}#contents ul>li>ul>li{font-size:12pt}h1.title{margin-top:0;position:fixed;z-index:10;padding:20px;width:100%}code,table tr:nth-child(2n),tt{background-color:#f8f8f8}.one-col{min-width:310px;height:500px;margin:0 auto}.two-col-left{height:300px;width:49%;float:left}.two-col-right{height:300px;width:49%;float:right}button{margin:0 5px 0 0;padding:5px 25px;font-size:18px;line-height:1.8;appearance:none;box-shadow:none;border-radius:3px;border:none}button:focus{outline:0}button:hover{background-color:#4183C4}button:active{background-color:#27496d}.legend-rect{width:20px;height:20px;margin-right:8px;margin-left:20px;float:left;-webkit-border-radius:2px;border-radius:2px}a{color:#4183C4;text-decoration:none}a.absent{color:#c00}a.anchor{padding-left:30px;margin-left:-30px;cursor:pointer;position:absolute;top:0;left:0;bottom:0}dl,dl dt,dl dt:first-child,hr,table,table tr{padding:0}table tr td,table tr th{border:1px solid #ccc;text-align:left;padding:6px 13px}h1,h2,h3,h4,h5,h6{margin:20px 0 10px;padding:0;font-weight:700;-webkit-font-smoothing:antialiased;cursor:text;position:relative}h1:hover a.anchor,h2:hover a.anchor,h3:hover a.anchor,h4:hover a.anchor,h5:hover a.anchor,h6:hover a.anchor{text-decoration:none}h1 code,h1 tt,h2 code,h2 tt,h3 code,h3 tt,h4 code,h4 tt,h5 code,h5 tt,h6 code,h6 tt{font-size:inherit}h1{font-size:28px;color:#151d26;border-bottom:1px solid #ccc}h2{font-size:24px;color:#000}h3{font-size:18px}h4{font-size:16px}dl dt,h5,h6{font-size:14px}h6{color:#777}blockquote,dl,li,ol,p,pre,table,ul{margin:15px 0}hr{background:url(http://tinyurl.com/bq5kskr) repeat-x;border:0;color:#ccc;height:4px}a:first-child h1,a:first-child h2,a:first-child h3,a:first-child h4,a:first-child h5,a:first-child h6{margin-top:0;padding-top:0}h1 p,h2 p,h3 p,h4 p,h5 p,h6 p{margin-top:0}dl dt{font-weight:700;font-style:italic;margin:15px 0 5px}blockquote>:first-child,dl dd>:first-child,dl dt>:first-child,table tr td :first-child,table tr th :first-child{margin-top:0}blockquote>:last-child,dl dd>:last-child,dl dt>:last-child{margin-bottom:0}dl dd{margin:0 0 15px;padding:0 15px}blockquote{border-left:4px solid #ddd;padding:0 15px;color:#777}table{border-spacing:0;border-collapse:collapse}table tr{border-top:1px solid #ccc;background-color:#fff;margin:0}table tr th{font-weight:700;margin:0}table tr td{margin:0}table tr td :last-child,table tr th :last-child{margin-bottom:0}img{max-width:100%}span.frame{display:block;overflow:hidden}span.frame>span{border:1px solid #ddd;display:block;float:left;overflow:hidden;margin:13px 0 0;padding:7px;width:auto}span.frame span img{display:block;float:left}span.frame span span{clear:both;color:#333;display:block;padding:5px 0 0}span.align-center{display:block;overflow:hidden;clear:both}span.align-center>span{display:block;overflow:hidden;margin:13px auto 0;text-align:center}span.align-center span img{margin:0 auto;text-align:center}span.align-right{display:block;overflow:hidden;clear:both}span.align-right>span{display:block;overflow:hidden;margin:13px 0 0;text-align:right}span.align-right span img{margin:0;text-align:right}span.float-left{display:block;margin-right:13px;overflow:hidden;float:left}span.float-left span{margin:13px 0 0}span.float-right{display:block;margin-left:13px;overflow:hidden;float:right}span.float-right>span{display:block;overflow:hidden;margin:13px auto 0;text-align:right}code,tt{margin:0 2px;padding:0 5px;white-space:nowrap;border:1px solid #eaeaea;border-radius:3px}pre code{margin:0;padding:0;white-space:pre;background:0 0}.highlight pre,pre{background-color:#f8f8f8;border:1px solid #ccc;font-size:13px;line-height:19px;overflow:auto;padding:6px 10px;border-radius:3px}pre code,pre tt{background-color:transparent;border:none}div#metadata{text-align:right}h1{line-height:1.6}
    </style>
"""


rule all:
    input:
        "summary.html",
        "PROTOCOL-VERSION.txt"


rule print_protocol_version:
    output:
        "PROTOCOL-VERSION.txt"
    shell:
        "hundo --version > {output}"


rule download_reference_data:
    output:
        "%s/{filename}" % config.get("database_dir", ".")
    run:
        shell("curl 'https://zenodo.org/record/1043977/files/{wildcards.filename}' -s > {output}")
        if not REFERENCES[os.path.basename(output[0])] == md5(output[0]):
            raise OSError(2, "Invalid checksum", output[0])


rule create_fasta_from_reference:
    input:
        BLASTREF
    output:
        REFFASTA
    params:
        db = lambda wildcards, input: os.path.join(os.path.dirname(input[0]), "%s.fasta" % os.path.basename(input[0]).partition(".")[0])
    conda:
        CONDAENV
    shell:
        """blastdbcmd -db {params.db} -outfmt %f -entry all -out {output}"""


rule get_raw_r1_fastq_qualities:
    input:
        unpack(lambda wc: SAMPLES[wc.sample])
    output:
        r1 = "logs/{sample}_R1_eestats.txt",
    conda:
        CONDAENV
    threads:
        1
    group:
        "sample_group"
    shell:
        """
        vsearch --threads {threads} --fastq_eestats {input.r1} --output {output.r1}
        """


rule get_raw_r2_fastq_qualities:
    input:
        unpack(lambda wc: SAMPLES[wc.sample])
    output:
        r2 = "logs/{sample}_R2_eestats.txt",
    conda:
        CONDAENV
    threads:
        1
    group:
        "sample_group"
    shell:
        """
        vsearch --threads {threads} --fastq_eestats {input.r2} --output {output.r2}
        """


rule trim_and_filter_reads:
    input:
        unpack(lambda wc: SAMPLES[wc.sample])
    output:
        r1 = temp("tmp/{sample}_R1.fastq"),
        r2 = temp("tmp/{sample}_R2.fastq"),
        stats = "logs/{sample}_quality_filtering_stats.txt"
    benchmark:
        "logs/benchmarks/quality_filter/{sample}.txt"
    params:
        lref = "lref=%s" % config.get("filter_adapters") if config.get("filter_adapters") else "",
        rref = "rref=%s" % config.get("filter_adapters") if config.get("filter_adapters") else "",
        fref = "fref=%s" % config.get("filter_contaminants") if config.get("filter_contaminants") else "",
        mink = "" if not config.get("filter_adapters") and not config.get("filter_contaminants") else "mink=%d" % config.get("reduced_kmer_min", 8),
        trimq = config.get("minimum_base_quality", 10),
        hdist = "" if not config.get("filter_adapters") and not config.get("filter_contaminants") else "hdist=%d" % config.get("allowable_kmer_mismatches", 1),
        k = "" if not config.get("filter_adapters") and not config.get("filter_contaminants") else "k=%d" % config.get("reference_kmer_match_length", 27),
        qtrim = config.get("qtrim", "rl"),
        minlength = config.get("minimum_passing_read_length", 100)
    conda:
        CONDAENV
    log:
        "logs/{sample}_quality_filter.log"
    threads:
        max(int(config.get("threads", 4) * 0.25), 1)
    group:
        "sample_group"
    shell:
        """
        bbduk2.sh in={input.r1} in2={input.r2} out={output.r1} \
            out2={output.r2} {params.rref} {params.lref} {params.fref} \
            {params.mink} qout=33 stats={output.stats} {params.hdist} \
            {params.k} trimq={params.trimq} qtrim={params.qtrim}
            threads={threads} minlength={params.minlength} \
            overwrite=true 2> {log}
        """


rule count_raw_reads:
    input:
        "logs/{sample}_quality_filtering_stats.txt"
    output:
        "logs/raw_counts/{sample}_R1.count"
    # unsure if this will work
    group:
        "sample_group"
    run:
        with open(input[0]) as filein, open(output[0], "w") as fileout:
            for line in filein:
                if line.startswith("#Total"):
                    total = int(line.strip().split("\t")[1]) / 2.
                    print(total, file=fileout)
                    break


rule count_filtered_reads:
    input:
        "tmp/{sample}_R1.fastq"
    output:
        "logs/filtered_counts/{sample}_R1.count"
    group:
        "sample_group"
    shell:
        "awk '{{n++}}END{{print int(n/4)}}' {input} > {output}"


rule merge_reads:
    input:
        r1 = "tmp/{sample}_R1.fastq",
        r2 = "tmp/{sample}_R2.fastq"
    output:
        temp("tmp/{sample}_merged.fastq")
    params:
        minimum_merge_length = config.get("minimum_merge_length", 150),
        fastq_allowmergestagger = "--fastq_allowmergestagger" if config.get("fastq_allowmergestagger", False) else "",
        fastq_maxdiffs = config.get("fastq_maxdiffs", 5),
        fastq_minovlen = config.get("fastq_minovlen", 16)
    conda:
        CONDAENV
    log:
        "logs/{sample}_merge_reads.log"
    group:
        "sample_group"
    shell:
        """
        vsearch --fastq_mergepairs {input.r1} --reverse {input.r2} \
            --label_suffix \;sample={wildcards.sample}\; \
            --fastq_minmergelen {params.minimum_merge_length} \
            {params.fastq_allowmergestagger} \
            --fastq_maxdiffs {params.fastq_maxdiffs} \
            --fastq_minovlen {params.fastq_minovlen} \
            --fastqout {output} --log {log}
        """


rule count_merged_reads:
    input:
        "tmp/{sample}_merged.fastq"
    output:
        "logs/merged_counts/{sample}_R1.count"
    group:
        "sample_group"
    shell:
        "awk '{{n++}}END{{print int(n/4)}}' {input} > {output}"


rule get_prefilter_fastq_stats:
    input:
        "tmp/{sample}_merged.fastq"
    output:
        "logs/{sample}_merged_eestats.txt"
    conda:
        CONDAENV
    threads:
        1
    group:
        "sample_group"
    shell:
        """
        vsearch --threads {threads} --fastq_eestats {input} \
            --output {output}
        """


rule filter_merged_reads:
    input:
        "tmp/{sample}_merged.fastq"
    output:
        fa = temp("tmp/{sample}_merged_filtered.fasta"),
        fq = temp("tmp/{sample}_merged_filtered.fastq")
    params:
        maxee = config.get("maximum_expected_error", 1),
        maxns = config.get("vsearch_maxns", 0)
    conda:
        CONDAENV
    log:
        "logs/{sample}_fastq_filter.log"
    group:
        "sample_group"
    shell:
        """
        vsearch --fastq_filter {input} --fastq_maxee {params.maxee} \
            --fastq_maxns {params.maxns} --fastaout {output.fa} \
            --fastqout {output.fq}
        """


rule get_postfilter_fastq_stats:
    input:
        "tmp/{sample}_merged_filtered.fastq"
    output:
        "logs/{sample}_merged_filtered_eestats.txt"
    conda:
        CONDAENV
    threads:
        1
    group:
        "sample_group"
    shell:
        """
        vsearch --threads {threads} --fastq_eestats {input} \
            --output {output}
        """


rule dereplicate_sequences_by_sample:
    input:
        "tmp/{sample}_merged_filtered.fasta"
    output:
        temp("tmp/{sample}_dereplicated.fasta")
    conda:
        CONDAENV
    log:
        "logs/{sample}_dereplication.log"
    group:
        "sample_group"
    shell:
        """
        vsearch --derep_fulllength {input} --output {output} \
            --strand plus --sizeout --relabel {wildcards.sample}.
        """


rule combine_merged_reads:
    input:
        fastas = expand("tmp/{sample}_dereplicated.fasta", sample=SAMPLES)
    output:
        fasta = "all-sequences.fasta"
    run:
        import shutil

        with open(output.fasta, 'wb') as ofh:
            for f in input.fastas:
                with open(f, 'rb') as ifh:
                    shutil.copyfileobj(ifh, ofh, 1024*1024*10)


rule dereplicate_sequences:
    input:
        "all-sequences.fasta"
    output:
        fa = temp("dereplicated-sequences.fasta"),
        uc = temp("dereplicated.uclust")
    params:
        minuniquesize = config.get("vsearch_minuniquesize", 2)
    conda:
        CONDAENV
    log:
        "logs/dereplicated-sequences.log"
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        vsearch --derep_fulllength {input} --output {output.fa} \
            --sizein --sizeout --minuniquesize {params.minuniquesize} \
            --threads {threads} -log {log} --uc {output.uc}
        """


rule precluster_sequences:
    input:
        "dereplicated-sequences.fasta"
    output:
        fa = temp("preclustered.fasta"),
        uc = temp("preclustered.uclust")
    params:
        id = min((100 - float(config.get("percent_of_allowable_difference", 3))) / 100, 1.00)
    conda:
        CONDAENV
    log:
        "logs/precluster.log"
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        vsearch --threads {threads} --cluster_size {input} \
            --id {params.id} --strand plus --sizein --sizeout \
            --centroids {output.fa} --uc {output.uc}
        """


rule run_denovo_chimera_filter:
    input:
        "preclustered.fasta"
    output:
        temp("chimera-filtered_denovo.fasta")
    conda:
        CONDAENV
    log:
        "logs/chimera-filtered_denovo.log"
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        vsearch --uchime_denovo {input} --nonchimeras {output} \
            --strand plus --sizein --sizeout --threads {threads} \
            --log {log}
        """


rule run_reference_chimera_filter:
    input:
        unpack(get_run_reference_chimera_filter_inputs)
    output:
        temp("chimera-filtered_reference.fasta")
    conda:
        CONDAENV
    log:
        "logs/chimera-filtered_reference.log"
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        vsearch --uchime_ref {input.fasta} --nonchimeras {output} \
            --strand plus --sizein --sizeout --db {input.db} \
            --threads {threads} --log {log}
        """


rule extract_filtered_sequences:
    # Extract all non-chimeric, non-singleton sequences, dereplicated
    input:
        filter_fa = "dereplicated-sequences.fasta",
        uc = "preclustered.uclust",
        keep_fa = "chimera-filtered_reference.fasta"
    output:
        temp("nonchimeric-nonsingleton.fasta")
    group:
        "combined_group"
    run:
        filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0])


rule pull_seqs_from_samples_with_filtered_sequences:
    # Extract all non-chimeric, non-singleton sequences in each sample
    input:
        filter_fa = "all-sequences.fasta",
        uc = "dereplicated.uclust",
        keep_fa = "nonchimeric-nonsingleton.fasta"
    output:
        temp("all-sequences_filtered.fasta")
    group:
        "combined_group"
    run:
        filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0])


rule cluster_sequences:
    input:
        # get_cluster_sequences_input
        "all-sequences_filtered.fasta"
    output:
        fa = "OTU.fasta"
    params:
        minsize = config.get("minimum_sequence_abundance", 2),
        otu_id_pct = (100 - float(config.get("percent_of_allowable_difference", 3))) / 100.
    conda:
        CONDAENV
    log:
        "logs/cluster_sequences.log"
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        vsearch --cluster_size {input} --centroids {output.fa} \
            --relabel OTU_ --id {params.otu_id_pct} --log {log} \
            --sizein --strand plus --threads {threads}
        """


rule run_blast:
    input:
        fasta = "OTU.fasta",
        # force download
        db = BLASTREF,
        db_fa = REFFASTA
    output:
        "blast-hits.txt"
    params:
        # works due to file naming convention
        db = REFFASTA,
        num_alignments = config.get("blast_num_alignments", 12)
    conda:
        CONDAENV
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        blastn -query {input.fasta} -db {params.db} \
            -num_alignments {params.num_alignments} \
            -outfmt 6 -out {output} -num_threads {threads}
        """


rule compute_lca:
    input:
        fasta = "OTU.fasta",
        hits = "blast-hits.txt",
        map = REFMAP,
        tre = REFTRE
    output:
        fasta = "OTU_tax.fasta",
        tsv = temp("OTU_assignments.tsv")
    params:
        min_score = config.get("blast_minimum_bitscore", 125),
        top_fraction = config.get("blast_top_fraction", 0.95)
    group:
        "combined_group"
    shell:
        """
        hundo lca --min-score {params.min_score} \
            --top-fraction {params.top_fraction} \
            {input.fasta} {input.hits} {input.map} \
            {input.tre} {output.fasta} {output.tsv}
        """


rule compile_counts:
    input:
        seqs = "all-sequences.fasta",
        db = "OTU_tax.fasta"
    output:
        tsv = "OTU.txt"
    params:
        id_req = config.get("read_identity_requirement", 0.80),
        minqt = config.get("vsearch_minqt", 0.80)
    conda:
        CONDAENV
    threads:
        config.get("threads", 1)
    group:
        "combined_group"
    shell:
        """
        vsearch --usearch_global {input.seqs} \
            --db {input.db} --id {params.id_req} \
            --otutabout {output.tsv} --sizein --strand plus \
            --threads {threads}
        """


rule create_biom_from_txt:
    input:
        "OTU.txt"
    output:
        "OTU.biom"
    shadow:
        "shallow"
    conda:
        CONDAENV
    group:
        "combined_group"
    shell:
        '''
        sed 's|\"||g' {input} | sed 's|\,|\;|g' > OTU_converted.txt
        biom convert -i OTU_converted.txt -o {output} --to-json \
            --process-obs-metadata sc_separated --table-type "OTU table"
        '''


rule multiple_align:
    input:
        "OTU.fasta"
    output:
        "OTU_aligned.fasta"
    threads:
        # mafft (via conda) is single-threaded on macOS
        config.get("threads", 1)
    conda:
        CONDAENV
    group:
        "combined_group"
    shell:
        """
        mafft --auto --thread {threads} --quiet --maxiterate 1000 \
            --globalpair {input} > {output}
        """


rule newick_tree:
    input:
        "OTU_aligned.fasta"
    output:
        "OTU.tree"
    threads:
        config.get("threads", 1)
    conda:
        CONDAENV
    log:
        "logs/fasttree.log"
    group:
        "combined_group"
    shell:
        """
        export OMP_NUM_THREADS={threads} && \
            FastTreeMP -nt -gamma -spr 4 -log {log} -quiet {input} > {output}
        """


rule make_deliverables_archive:
    input:
        "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt"
    output:
        "hundo_results.zip"
    conda:
        CONDAENV
    group:
        "combined_group"
    shell:
        "zip {output} {input}"


rule report:
    input:
        biom = "OTU.biom",
        txt = "OTU.txt",
        archive = "hundo_results.zip",
        fastq_stats_post = expand("logs/{sample}_merged_filtered_eestats.txt", sample=SAMPLES.keys()),
        raw_r1_quals = expand("logs/{sample}_R1_eestats.txt", sample=SAMPLES.keys()),
        raw_r2_quals = expand("logs/{sample}_R2_eestats.txt", sample=SAMPLES.keys()),
        raw_counts = expand("logs/raw_counts/{sample}_R1.count", sample=SAMPLES.keys()),
        filtered_counts = expand("logs/filtered_counts/{sample}_R1.count", sample=SAMPLES.keys()),
        merged_counts = expand("logs/merged_counts/{sample}_R1.count", sample=SAMPLES.keys())
    shadow:
        "shallow"
    params:
        filter_adapters = "None" if not config.get("filter_adapters") else config.get("filter_adapters"),
        filter_contaminants = "None" if not config.get("filter_contaminants") else config.get("filter_contaminants"),
        allowable_kmer_mismatches = config.get("allowable_kmer_mismatches"),
        reference_kmer_match_length = config.get("reference_kmer_match_length"),
        reduced_kmer_min = config.get("reduced_kmer_min"),
        minimum_passing_read_length = config.get("minimum_passing_read_length"),
        minimum_base_quality = config.get("minimum_base_quality"),
        minimum_merge_length = config.get("minimum_merge_length"),
        fastq_allowmergestagger = "True" if config.get("fastq_allowmergestagger", False) else "False",
        fastq_maxdiffs = config.get("fastq_maxdiffs", 5),
        fastq_minovlen = config.get("fastq_minovlen", 16),
        maximum_expected_error = config.get("maximum_expected_error"),
        reference_chimera_filter = config.get("reference_chimera_filter"),
        minimum_sequence_abundance = config.get("minimum_sequence_abundance"),
        percent_of_allowable_difference = config.get("percent_of_allowable_difference"),
        reference_database = config.get("reference_database"),
        blast_minimum_bitscore = config.get("blast_minimum_bitscore"),
        blast_top_fraction = config.get("blast_top_fraction"),
        read_identity_requirement = config.get("read_identity_requirement")
    output:
        html = "summary.html"
    group:
        "combined_group"
    run:
        import csv
        from collections import defaultdict

        import numpy as np
        import pandas as pd
        import plotly.figure_factory as ff
        import plotly.graph_objs as go
        from biom import parse_table
        from plotly import offline


        PLOTLY_PARAMS = dict(
            include_plotlyjs=False, show_link=False, output_type="div"
        )
        BLUE = "rgba(31,119,180,0.3)"
        RED = "rgba(214,39,40,0.3)"

        def shannon_x(x, total):
            return (x / total) * np.log(x / total) if x > 0 else 0

        shannon_xf = np.vectorize(shannon_x, otypes=[np.float])

        def shannon_index(arr):
            return -1 * sum(shannon_xf(arr, sum(arr)))


        # summary stats: stats from the biom table
        with open(input.biom) as fh:
            bt = parse_table(fh)
            biom_df = bt.to_dataframe()
            s_idx = biom_df.apply(shannon_index, axis=0)
            s_idx.sort_values(inplace=True)
            # sample order for plots
            sample_order = s_idx.index.tolist()
            otu_df = [
                ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"],
                [bt.length("sample"), bt.length("observation"), bt.sum(), '{:f}'.format(bt.get_table_density())]
            ]
            otu_df = pd.DataFrame(otu_df)
            # make the first row the column labels
            otu_df.rename(columns=otu_df.iloc[0], inplace=True)
            # drop the first row
            otu_df = otu_df.reindex(otu_df.index.drop(0))
            # build the table
            summary_table = otu_df.to_html(
                index=False, bold_rows=False, classes=["table", "table-bordered"], table_id="otuTable"
            )
            # fix for pandoc
            summary_table = summary_table.replace("\n", "\n" + 10 * " ")
            del(otu_df)

        # quality plot
        raw_qual_stats = defaultdict(lambda: defaultdict(list))
        for r1_ee_file in input.raw_r1_quals:
            sample_name = r1_ee_file.partition("logs/")[-1].partition("_R1")[0]
            r2_ee_file = "_R2".join(r1_ee_file.rsplit("_R1", 1))
            with open(r1_ee_file) as fh:
                reader = csv.DictReader(fh, delimiter="\t")
                for row in reader:
                    raw_qual_stats["R1"][sample_name].append(float(row["Mean_Q"]))
            with open(r2_ee_file) as fh:
                reader = csv.DictReader(fh, delimiter="\t")
                for row in reader:
                    q = float(row["Mean_Q"])
                    raw_qual_stats["R2"][sample_name].append(q)
        data = []
        for read_index, sample_data in raw_qual_stats.items():
            color = BLUE if read_index == "R1" else RED
            for sample_name, sample_stats in sample_data.items():
                data.append(go.Scatter(
                    x=list(range(1,len(sample_stats))),
                    y=sample_stats,
                    name=sample_name,
                    text=sample_name,
                    hoverinfo="text+x+y",
                    legendgroup=read_index,
                    mode="lines",
                    line=dict(
                        color=color,
                        dash="solid"
                        )
                    )
                )
        layout = go.Layout(
            title='Mean Quality Scores for R1 and R2',
            margin={"b": "auto", "r": "auto"},
            xaxis={"title":"Position"},
            yaxis={"title":"Quality (Phred score)"},
            hovermode="closest",
            showlegend=False,
            autosize=True,
            annotations=[
                dict(
                    x=0,
                    y=1.1,
                    xref="paper",
                    yref="paper",
                    text="Forward",
                    showarrow=False,
                    font=dict(
                        size=16,
                        color="#ffffff"
                    ),
                    align="left",
                    borderpad=4,
                    bgcolor="#1f77b4",
                ),
                dict(
                    x=0.15,
                    y=1.1,
                    xref="paper",
                    yref="paper",
                    text="Reverse",
                    showarrow=False,
                    font=dict(
                        size=16,
                        color='#ffffff'
                    ),
                    align='left',
                    borderpad=4,
                    bgcolor="#d62728",
                )
            ]
        )
        fig = go.Figure(data=data, layout=layout)
        quality_plot = offline.plot(
            fig,
            **PLOTLY_PARAMS
        )

        # taxonomy plot with buttons
        df = pd.read_table(input.txt)
        # sample_cols = [i for i in df.columns if i not in ["#OTU ID", "taxonomy"]]
        df[["kingdom", "phylum", "class", "order", "family", "genus", "species"]] = df["taxonomy"].str.split(",", expand=True)
        tax_dataframes = []
        tax_dataframes_rel = []
        levels = ["phylum", "class", "order"]
        for level in levels:
            sub = df[[level] + sample_order].copy()
            # cols = sub[level].tolist()
            sub[level] = sub[level].str.replace("%s__" % level[0], "")
            # sum same assignments over rows
            sub = sub.groupby([level]).sum().reset_index()
            # sort the stacks
            sub["TotalCount"] = sub[sample_order].sum(axis=1)
            sub.sort_values("TotalCount", ascending=False, inplace=True)
            # redefine columns after sort
            sub = sub[[level] + sample_order]
            # relative abundance dataframe
            sub_p = sub.copy()
            sub_p[sample_order] = sub_p[sample_order].div(sub_p[sample_order].sum())
            sub_p[sample_order] = sub_p[sample_order] * 100
            sub_p = sub_p.transpose()
            sub_p.columns = sub_p.loc[level]
            sub_p.drop([level], inplace=True)
            tax_dataframes_rel.append(sub_p)
            # absolute abundance dataframe
            sub = sub.transpose()
            sub.columns = sub.loc[level]
            sub.drop([level], inplace=True)
            tax_dataframes.append(sub)
        del(df)
        data = []
        lengths = []
        for i, subset in enumerate(tax_dataframes):
            data += [go.Bar(x=subset.index,
                            y=subset[tax],
                            name=tax,
                            text=tax,
                            hoverinfo="text+y",
                            visible=True if i == 0 else False
                            ) for tax in subset.columns]
            lengths.append(len(subset.columns))
        # plot buttons
        updatemenus = list([
            dict(type="buttons",
                 active=0,
                 buttons=list([
                    dict(label = 'Phylum',
                         method = 'update',
                         args = [{'visible': [True]*lengths[0] + [False]*lengths[1]  + [False]*lengths[2]},
                                 {"title":"Assigned Taxonomy Per Sample (Level: Phylum)"}
                                ]),
                    dict(label = 'Class',
                         method = 'update',
                         args = [{'visible': [False]*lengths[0] + [True]*lengths[1]  + [False]*lengths[2]},
                                 {"title":"Assigned Taxonomy Per Sample (Level: Class)"}
                                ]),
                    dict(label = 'Order',
                         method = 'update',
                         args = [{'visible': [False]*lengths[0] + [False]*lengths[1]  + [True]*lengths[2]},
                                 {"title":"Assigned Taxonomy Per Sample (Level: Order)"}
                                ])
                ]),
                 direction = 'left',
                 pad = {'r': 0, 't': 0},
                 showactive = True,
                 x = 0,
                 xanchor = 'left',
                 y = 1.05,
                 yanchor = 'top'
            )
        ])
        layout = dict(title="Assigned Taxonomy Per Sample (Level: Phylum)",
                      updatemenus=updatemenus,
                      barmode="stack",
                      height=700,
                      margin={"b": "auto", "r": "auto"},
                      # xaxis={"tickangle": -60},
                      yaxis={"title": "Count"},
                      showlegend=False,
                      hovermode='closest'
                     )
        fig = go.Figure(data=data, layout=layout)
        taxonomy_plot = offline.plot(
            fig,
            **PLOTLY_PARAMS
        )
        # make the relative abundance taxonomy plot
        data = []
        for i, subset in enumerate(tax_dataframes_rel):
            data += [go.Bar(x=subset.index,
                            y=subset[tax],
                            name=tax,
                            text=tax,
                            hoverinfo="text+y",
                            visible=True if i == 0 else False
                            ) for tax in subset.columns]
        layout["yaxis"]["title"] = "Relative Abundance"
        fig = go.Figure(data=data, layout=layout)
        relative_abundance_taxonomy_plot = offline.plot(
            fig,
            **PLOTLY_PARAMS
        )

        # merged reads quality plot
        ee_stats = defaultdict(list)
        for stats_file in input.fastq_stats_post:
            sample_name = stats_file.partition("logs/")[-1].partition("_merged")[0]
            with open(stats_file) as fh:
                reader = csv.DictReader(fh, delimiter="\t")
                for row in reader:
                    ee_stats[sample_name].append(row["Max_EE"])
        data = []
        for sample_name, sample_stats in ee_stats.items():
            data.append(go.Scatter(
                x=list(range(1,len(sample_stats))),
                y=sample_stats,
                name=sample_name,
                text=sample_name,
                hoverinfo="text+x+y",
                mode="lines",
                line=dict(
                    color=BLUE,
                    dash="solid"
                    )
                )
            )
        layout = go.Layout(
            title='Merged Sequence Quality (Max EE: {})'.format(params.maximum_expected_error),
            margin={"b": "auto", "r": "auto"},
            yaxis={"title":"Quality (Phred score)"},
            xaxis={"title":"Position"},
            hovermode="closest",
            showlegend=False
        )
        fig = go.Figure(data=data, layout=layout)
        merged_reads_quality_plot = offline.plot(
            fig,
            **PLOTLY_PARAMS
        )

        # sample summary table
        count_data = []
        for sample, count in OMITTED.items():
            count_data.append([sample, count, "NA", "NA", "NA", "NA"])
        for sample in sample_order:
            sd = [sample]
            count_file = "logs/raw_counts/{}_R1.count".format(sample)
            with open(count_file) as fh:
                for line in fh:
                    sd.append(int(float(line.strip())))
            with open(count_file.replace("raw_counts/", "filtered_counts/")) as fh:
                for line in fh:
                    sd.append(int(float(line.strip())))
            with open(count_file.replace("raw_counts/", "merged_counts/")) as fh:
                for line in fh:
                    sd.append(int(float(line.strip())))
            sd.append(int(biom_df[sample].sum()))
            sd.append('{:f}'.format(s_idx.loc[sample]))
            count_data.append(sd)
        ss = pd.DataFrame(count_data, columns=["Sample", "Raw", "Filtered", "Merged", "In OTUs", "Shannon"])
        sample_summary_table = ss.to_html(
            index=False, bold_rows=False, classes=["table", "table-bordered"], table_id="summaryTable"
        )
        sample_summary_table = sample_summary_table.replace("\n", "\n" + 10 * " ")

        # sample summary counts plot
        # remove omitted samples from df
        keep = [idx for idx, sample in ss.Sample.items() if sample not in OMITTED]
        ss = ss.iloc[keep]
        data = [go.Bar(x=ss.Sample,
                    y=ss[metric],
                    name=metric,
                    visible=True if i == 0 else "legendonly",
                    ) for i, metric in enumerate(["Raw", "Filtered", "Merged", "In OTUs"])
        ]
        layout = dict(title="Counts Per Sample By Stage",
                      barmode="group",
                      height=700,
                      margin={"b": "auto", "r": "auto"},
                      # xaxis={"tickangle": -60},
                      yaxis={"title": "Count"},
                      showlegend=True,
                      hovermode="compare",
                      bargroupgap=0.1,
                      legend={"orientation": "h",
                              "x": 0,
                              "y":1.06}
                     )
        fig = go.Figure(data=data, layout=layout)
        sample_counts_plot = offline.plot(
            fig,
            **PLOTLY_PARAMS
        )

        # conda environment
        conda_env = ""
        with open(CONDAENV) as fh:
            for i, line in enumerate(fh):
                if i == 0:
                    conda_env += line
                else:
                    conda_env += "    " + line

        report_str = """

        .. raw:: html
            {STYLE}
            <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
            <script src="https://code.jquery.com/jquery-3.3.1.min.js"></script>
            <script src="https://cdn.datatables.net/1.10.18/js/jquery.dataTables.min.js"></script>
            <script src="https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js"></script>
            <script>
            $(document).ready( function () {{
                $('#summaryTable').DataTable();
                $('#otuTable').DataTable( {{
                    "searching": false,
                    "paging": false,
                    "ordering": false,
                    "info": false,
                }} );
            }} );
            </script>

        =============================================================
        hundo_ - Experiment Summary
        =============================================================

        .. contents::
            :backlinks: none
            :depth: 2

        Summary
        -------

        Sequence Counts
        ***************

        In the plots, samples are ordered by diversity (least to most).

        .. raw:: html

            <div style="overflow-x:auto;">
            {sample_summary_table}
            </div>
            {sample_counts_plot}

        Sequence Quality
        ****************

        .. raw:: html

            {quality_plot}
            {merged_reads_quality_plot}

        OTUs
        ****

        .. raw:: html

            <div style="overflow-x:auto;">
            {summary_table}
            </div>

        Taxonomy by Count
        *****************

        .. raw:: html

            {taxonomy_plot}

        Taxonomy by Percent
        *******************

        .. raw:: html

            {relative_abundance_taxonomy_plot}

        Methods
        -------

        ``hundo`` wraps a number of functions and its exact steps are
        dependent upon a given user's configuration.

        Paired-end sequence reads are quality trimmed and can be trimmed of
        adapters and filtered of contaminant sequences using BBDuk2 of the
        BBTools_ package. Passing reads are merged using
        VSEARCH_ then aggregated into a single FASTA file with headers
        describing the origin and count of the sequence.

        Prior to creating clusters, reads are filtered again based on expected
        error rates:

        To create clusters, the aggregated, merged reads are dereplicated to
        remove singletons by default using VSEARCH. Sequences are preclustered
        into centroids using VSEARCH to accelerate chimera filtering. Chimera
        filtering is completed in two steps: *de novo* and then reference
        based. The reference by default is the entire annotation database.
        Following chimera filtering, sequences are placed into clusters using
        distance-based, greedy cluster with VSEARCH based on the allowable
        percent difference of the configuration.

        After OTU sequences have been determined, BLAST_ is used to align
        sequences to the reference database. Reference databases were curated
        by the CREST_ team and hundo incorporates the CREST LCA method.

        Counts are assigned to OTUs using the global alignment method of
        VSEARCH, which outputs the final OTU table as a tab-delimited text
        file. The Biom_ command line tool is used to convert the tab table
        to biom.

        Multiple alignment of sequences is completed using MAFFT_.
        A tree based on the aligned sequences is built using FastTree2_.

        This workflow is built using Snakemake_ and makes use of Bioconda_
        to install its dependencies.

        .. _BBTools: https://sourceforge.net/projects/bbmap/
        .. _VSEARCH: https://github.com/torognes/vsearch
        .. _BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi
        .. _CREST: https://github.com/lanzen/CREST
        .. _Biom: http://biom-format.org/
        .. _MAFFT: https://mafft.cbrc.jp/alignment/software/
        .. _FastTree2: http://www.microbesonline.org/fasttree/
        .. _Snakemake: https://snakemake.readthedocs.io/en/stable/
        .. _Bioconda: https://bioconda.github.io/

        Configuration
        -------------

        ::

            filter_adapters: {params.filter_adapters}
            filter_contaminants: {params.filter_contaminants}
            allowable_kmer_mismatches: {params.allowable_kmer_mismatches}
            reference_kmer_match_length: {params.reference_kmer_match_length}
            allowable_kmer_mismatches: {params.allowable_kmer_mismatches}
            reference_kmer_match_length: {params.reference_kmer_match_length}
            reduced_kmer_min: {params.reduced_kmer_min}
            minimum_passing_read_length: {params.minimum_passing_read_length}
            minimum_base_quality: {params.minimum_base_quality}
            minimum_merge_length: {params.minimum_merge_length}
            allow_merge_stagger: {params.fastq_allowmergestagger}
            max_diffs: {params.fastq_maxdiffs}
            min_overlap: {params.fastq_minovlen}
            maximum_expected_error: {params.maximum_expected_error}
            reference_chimera_filter: {params.reference_chimera_filter}
            minimum_sequence_abundance: {params.minimum_sequence_abundance}
            percent_of_allowable_difference: {params.percent_of_allowable_difference}
            reference_database: {params.reference_database}
            blast_minimum_bitscore: {params.blast_minimum_bitscore}
            blast_top_fraction: {params.blast_top_fraction}
            read_identity_requirement: {params.read_identity_requirement}


        Execution Environment
        ---------------------

        ::

            {conda_env}


        Output Files
        ------------

        Not all files written by the workflow are contained within the
        Downloads section of this page to minimize the size of the this
        document. Other output files are described and are written to the
        results directory.


        Attached
        ********

        The zip archive contains the following files:

        OTU.biom
        ````````

        Biom table with raw counts per sample and their associated
        taxonomic assignment formatted to be compatible with downstream tools
        like phyloseq_.

        OTU.fasta
        `````````

        Representative DNA sequences of each OTU.

        OTU.tree
        ````````

        Newick tree representation of aligned OTU sequences.

        OTU.txt
        ```````

        Tab-delimited text table with columns OTU ID, a column for each sample,
        and taxonomy assignment in the final column as a comma delimited list.


        Other Result Files
        ******************

        Other files that may be needed, but that are not included in the
        attached archive include:

        OTU_aligned.fasta
        `````````````````

        OTU sequences after alignment using MAFFT.

        all-sequences.fasta
        ```````````````````

        Quality-controlled, dereplicated DNA sequences of all samples. The
        header of each record identifies the sample of origin and the count
        resulting from dereplication.

        blast-hits.txt
        ``````````````

        The BLAST assignments per OTU sequence.

        Downloads
        ---------

        .. _hundo: https://github.com/pnnl/hundo
        .. _phyloseq: https://joey711.github.io/phyloseq/

        """
        report(report_str,
            output.html,
            metadata="Author: " + config.get("author", "hundo"),
            stylesheet="",
            archive=input.archive
        )


onsuccess:
    if not config.get("no_temp_declared"):
        shutil.rmtree("tmp", ignore_errors=True)
    print("Protocol [%s] completed successfully." % PROTOCOL_VERSION)
