#!/usr/bin/env python

# The MIT License (MIT)
#
# Copyright (c) [2012-2022] [Peter Hickey (peter.hickey@gmail.com)]
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

#### Explanation of XM-tag (methylation string) ####
# . for bases not involving cytosines
# X for methylated C in CHG context (was protected)
# x for not methylated C in CHG context (was converted)
# H for methylated C in CHH context (was protected)
# h for not methylated C in CHH context (was converted)
# Z for methylated C in CpG context (was protected)
# z for not methylated C in CpG context (was converted)
# U for methylated C in "unknown" context, e.g. CNN-context, (was protected).
# u for not methylated C in "unknown"-context, e.g. CNN-context, (was converted).

#### Import required modules ####
from __future__ import print_function
import re
import argparse
import sys
import csv
import warnings
import operator
import itertools
import os
import gzip
import bz2
try:
    import pysam # The only required module that is not a part of the Python Standard Library
except ImportError:
    exit_msg = 'ERROR: methtuple requires the Pysam module. Please install it from https://github.com/pysam-developers/pysam before continuing.'
    sys.exit(exit_msg)

from methtuple import *
from methtuple.mtuple import *
from methtuple.funcs import *

#### Command line parser ####
parser = argparse.ArgumentParser(description = 'Extract methylation patterns at m-tuples of methylation loci from the aligned reads of a bisulfite-sequencing experiment. Currently only supports SAM/BAM files created with Bismark.',
    prog = 'methtuple',
    formatter_class = argparse.ArgumentDefaultsHelpFormatter,
    add_help = False,
    usage = '%(prog)s [options] <in.bam>|<in.sam>\nPlease run \'%(prog)s -h\' for a full list of options.',
    epilog = '%(prog)s (v'+__version__+') by Peter Hickey (peter.hickey@gmail.com, https://github.com/PeteHaitch/methtuple/)')
# Input arguments
Input = parser.add_argument_group('Input options')
Input.add_argument('AlignmentFile',
    metavar = '<in.bam>|<in.sam>',
    help = "Input file in BAM or SAM format. Use - to specify STDIN. The header must be included and alignments must have been done using Bismark.")
Input.add_argument('--aligner',
    choices = ['Bismark', 'Bismark_old'],
    default = 'Bismark',
    help = 'The aligner used to generate the SAM/BAM file. Bismark_old refers to Bismark version < 0.8.3')
Input.add_argument('--Phred64',
    action = 'store_true',
    default = False,
    help = 'Quality scores are encoded as Phred64 rather than Phred33')
# Output arguments
Output = parser.add_argument_group('Output options')
Output.add_argument('-o', '--output-prefix',
    metavar = 'PREFIX',
    default = argparse.SUPPRESS,
    help = 'By default, all output files have the same prefix as that of the input file. This will override the prefix of output file names')
Output.add_argument('--sc', '--strand-collapse',
    action = 'store_true',
    default = False,
    help = "Collapse counts across across Watson and Crick strands. Only possible for CG methylation type. The strand is recorded as '*' if this option is selected.")
Output.add_argument('--nfff', '--no-failed-filter-file',
    action = 'store_true',
    default = False,
    help = 'Do not create the file listing the reads that failed to pass to pass the filters and which filter it failed')
Output.add_argument('--gzip',
    action = 'store_true',
    default = False,
    help = 'gzip all output files. --gzip and --bzip2 are mutually exclusive')
Output.add_argument('--bzip2',
    action = 'store_true',
    default = False,
    help = 'bzip2 all output files. --gzip and --bzip2 are mutually exclusive')
# Construction of methylation loci m-tuples
Options = parser.add_argument_group('Construction of methylation loci m-tuples')
Options.add_argument('--mt', '--methylation-type',
    choices = ['CG', 'CHG', 'CHH', 'CNN'],
    default = None,
    action = 'append',
    help = 'The methylation type. Multiple methylation types may be analysed jointly by repeated use of this argument, e.g., --methylation-type CG --methylation-type CHG. The default (\'None\') corresponds to CG')
Options.add_argument('-m',
    type = int,
    metavar = '<int>',
    default = 1,
    help = 'The size of the m-tuples, i.e., the \'m\' in m-tuples')
Options.add_argument('--ac', '--all-combinations',
    action = 'store_true',
    default = False,
    help = "Create all combinations of m-tuples, including non-neighbouring m-tuples. WARNING: This will greatly increase the memory usage, particularly for larger values of -m and when analysing non-CG methylation")
# Read-level filtering arguments
read_filtering = parser.add_argument_group('Filtering of reads', 'Applied before filtering of bases')
read_filtering.add_argument('--id', '--ignore-duplicates',
    action = 'store_true',
    default = False,
    help ='Ignore reads that have been flagged as PCR duplicates by, for example, Picard\'s MarkDuplicates function. More specifically, ignore reads with the 0x400 bit in the FLAG')
read_filtering.add_argument('--mmq', '--min-mapq',
    metavar = '<int>',
    type = int,
    default = 0,
    help = 'Ignore reads with a mapping quality score (mapQ) less than <int>')
read_filtering.add_argument('--of', '--overlap-filter',
    choices = ['sequence_strict', 'sequence', 'XM_strict', 'XM', 'XM_ol', 'quality', 'Bismark'],
    default = 'XM_ol',
    help = "The type of check to be performed (listed roughly from most-to-least stringent): Ignore the read-pair if the sequence in the overlap differs between mates (sequence_strict); Ignore the overlapping region if the sequence in the overlap differs between mates (sequence); Ignore the read-pair if the XM-tag in the overlap differs (XM_strict); Ignore the overlapping region if the XM-tag in the overlap differs between mates (XM); Ignore any positions in the overlapping region where the XM-tags differ between the mates (XM_ol); Use the mate with the higher average quality basecalls in the overlapping region (quality); Use the first mate of each read-pair, i.e., the method used by bismark_methylation_extractor with the --no_overlap flag (Bismark)")
read_filtering.add_argument('--uip', '--use-improper-pairs',
    action = 'store_true',
    default = False,
    help ='Use the improper read-pairs, i.e. don\'t filter them. More specifically, check the 0x2 FLAG bit of each read; the exact definition of an improper read-pair depends on the aligner and alignment parameters')
# Base-level filtering arguments
base_filtering = parser.add_argument_group('Filtering of bases', 'Applied after filtering of reads')
base_filtering.add_argument('--ir1p', '--ignore-read1-positions',
    default = None,
    metavar = 'VALUES',
    help = 'If single-end data, ignore these read positions from all reads. If paired-end data, ignore these read positions from just read_1 of each pair. Multiple values should be comma-delimited, ranges can be specified by use of the hyphen and all positions should use 1-based co-ordinates. For example, 1-5,80,95-100 corresponds to ignoring read-positions 1, 2, 3, 4, 5, 80, 98, 99, 100.')
base_filtering.add_argument('--ir2p', '--ignore-read2-positions',
    default = None,
    metavar = 'VALUES',
    help = 'Ignore these read positions from just read_2 of each pair if paired-end sequencing. Multiple values should be comma-delimited, ranges can be specified by use of the hyphen and all positions should use 1-based co-ordinates. For example, 1-5,80,95-100 corresponds to ignoring read-positions 1, 2, 3, 4, 5, 80, 98, 99, 100.')
base_filtering.add_argument('--mbq', '--min-base-qual',
    metavar = '<int>',
    type = int,
    default = 0,
    help = 'Ignore read positions with a base quality score less than <int>')
# Other
Other = parser.add_argument_group('Other')
Other.add_argument('-v', '--version',
    action = 'version',
    version = '%(prog)s (v'+__version__+')')
Other.add_argument('-h', '--help',
    action = 'help',
    help = 'show this help message and exit')

args = parser.parse_args()

#### Variable initialisations ####
# Set output prefix if unspecified
if not hasattr(args, 'output_prefix'):
    print(args.AlignmentFile)
    if args.AlignmentFile != "-":
        args.output_prefix = os.path.splitext(args.AlignmentFile)[0]
    else:
        exit_msg = "ERROR: Must supply -o PREFIX if input file is STDIN"
        sys.exit(exit_msg)

# Check whether gzip and bzip2 compression are both requested
if args.gzip and args.bzip2:
    exit_msg = "ERROR: Only one of --gzip or --bzip2 can be selected."
    sys.exit(exit_msg)
# Set the Phred quality score offset
if args.Phred64:
    phred_offset = 64
else:
    phred_offset = 33
# Set the "m" in "m-tuple", i.e. the size of the methylation-loci m-tuples
m = args.m
# Iterate over the list of methylation types stored in args.mt and create the proper REGEXP for searching for that methylation pattern in the XM-tag.
# There are 4 methylation types and therefore 2^4 = 16 possible subsets (including the empty set). Each subset must be dealt with separately.
# Also, set the ob_strand_offset variable.
# If the --strand-collapse flag is not set then we don't collapse across strands (i.e. ob_strand_offset = 0). If the flag is set then what happens depends on the value(s) of --methylation-type
# The default is to use the CG methylation type
if args.mt is None:
    args.mt = ['CG']
methylation_type = list(set(args.mt)) # list(set(x)) takes a list, x, and returns the unique elements as a list.
if not args.sc:
    ob_strand_offset = 0
# Firstly, deal with the cases where only a single methylation type is specified
if sorted(methylation_type) == ['CG']:
    methylation_pattern = re.compile(r'[Zz]')
    if args.sc:
        ob_strand_offset = 1
elif sorted(methylation_type) == ['CHG']:
    methylation_pattern = re.compile(r'[Xx]')
    if args.sc:
        exit_msg = 'ERROR: CHG-methylation is not strand-symmetric and therefore the --strand-collapse option cannot be specified.'
        sys.exit(exit_msg)
elif sorted(methylation_type) == ['CHH']:
    methylation_pattern = re.compile(r'[Hh]')
    if args.sc:
        exit_msg = 'ERROR: CHH-methylation is not strand-symmetric and therefore the --strand-collapse option cannot be specified.'
        sys.exit(exit_msg)
elif sorted(methylation_type) == ['CNN']:
    methylation_pattern = re.compile(r'[Uu]')
    if args.sc:
        exit_msg = 'ERROR: CNN-methylation is not strand-symmetric and therefore the --strand-collapse option cannot be specified.'
        sys.exit(exit_msg)
# Secondly, deal with the cases where multiple methylation types are specified or where an incompatible methylation type has been specified
else:
    # If multiple methylation types are specified then the --strand-collapse option cannot be set
    if args.sc:
        exit_msg = 'ERROR: The --strand-collapse option cannot be specified if multiple methylation types are passed via the --methylation-type flag.'
        sys.exit(exit_msg)
    if sorted(methylation_type) == ['CG', 'CHG']:
        methylation_pattern = re.compile(r'[ZzXx]')
    elif sorted(methylation_type) == ['CG', 'CHH']:
        methylation_pattern = re.compile(r'[ZzHh]')
    elif sorted(methylation_type) == ['CG', 'CNN']:
        methylation_pattern = re.compile(r'[ZzUU]')
    elif sorted(methylation_type) == ['CHG', 'CHH']:
        methylation_pattern = re.compile(r'[XxHh]')
    elif sorted(methylation_type) == ['CHG', 'CNN']:
        methylation_pattern = re.compile(r'[XxUu]')
    elif sorted(methylation_type) == ['CHH', 'CNN']:
        methylation_pattern = re.compile(r'[HhUu]')
    elif sorted(methylation_type) == ['CG', 'CHG', 'CHH']:
        methylation_pattern = re.compile(r'[ZzXxHh]')
    elif sorted(methylation_type) == ['CG', 'CHG', 'CNN']:
        methylation_pattern = re.compile(r'[ZzXxUU]')
    elif sorted(methylation_type) == ['CG', 'CHH', 'CNN']:
        methylation_pattern = re.compile(r'[ZzHhUu]')
    elif sorted(methylation_type) == ['CHG', 'CHH', 'CNN']:
        methylation_pattern = re.compile(r'[XxHhUu]')
    elif sorted(methylation_type) == ['CG', 'CHG', 'CHH', 'CNN']:
        methylation_pattern = re.compile(r'[ZzXxHhUu]')
    else:
        exit_msg = "ERROR: --methylation-type must be one or more of 'CG', 'CHG', 'CHH' or 'CNN'. Multiple methylation types may be specified simultaneously, e.g., --methylation-type CG --methylation-type CHG."
        sys.exit(exit_msg)

# Reformat the variable methylation_type to be a single string
methylation_type = '/'.join(sorted(methylation_type))
all_combinations = args.ac
ignore_read_1_pos = make_ignores_list(args.ir1p)
ignore_read_2_pos = make_ignores_list(args.ir2p)
overlap_filter = args.of
min_qual = args.mbq
min_mapping_quality = args.mmq

#### The main program. Loops over the AlignmentFile file line-by-line (i.e. alignedRead-by-alignedRead) and extracts the XM information for each read or read-pair. ####
# Open AlignmentFile file and output files (pysam autodetects SAM vs. BAM)
AlignmentFile = pysam.AlignmentFile(args.AlignmentFile, 'r')
if all_combinations:
  out_prefix = ".".join([args.output_prefix, '_'.join(methylation_type.split('/')), ''.join([str(args.m), 'ac']), "tsv"])
else:
  out_prefix = ".".join([args.output_prefix, '_'.join(methylation_type.split('/')), str(args.m), "tsv"])
hist_prefix = "".join([args.output_prefix, '.', '_'.join(methylation_type.split('/')), "_per_read.hist"])

if args.gzip:
  OUT = gzip.open(".".join([out_prefix, "gz"]), "wb")
  HIST = gzip.open(".".join([hist_prefix, "gz"]), "wb")
elif args.bzip2:
  OUT = bz2.BZ2File(".".join([out_prefix, "bz2"]), "w")
  HIST = bz2.BZ2file(".".join([hist_prefix, "bz2"]), "w")
else:
  OUT = open(out_prefix, "w")
  HIST = open(hist_prefix, "w")
if not args.nfff:
  nfff_prefix = "".join([args.output_prefix, ".reads_that_failed_QC.txt"])
  if args.gzip:
    FAILED_QC = gzip.open(".".join([nfff_prefix, "gz"]), "wb")
  elif args.bzip2:
    FAILED_QC = bz2.BZ2File(".".join([nfff_prefix, "bz2"]), "w")
  else:
    FAILED_QC = open(nfff_prefix, "w")
else:
    FAILED_QC = open(os.devnull, "w")

n_fragment = 0 # The number of DNA fragments. One single-end read contributes one to the count and each half of a read-pair contributes half a count.
n_fragment_skipped_due_to_bad_overlap = 0 # The number of DNA fragments skipped due to the overlapping sequencing not passing the appropriate filter
n_fragment_skipped_due_to_low_mapping_quality = 0 # The number of DNA fragments skipped due to low mapQ
n_fragment_skipped_due_to_duplicate = 0 # The number of DNA fragments skipped due to them being marked as duplicates
n_fragment_skipped_due_to_diff_chr = 0  # The number of DNA fragments skipped due to the mates being aligned to different chromosomes
n_fragment_skipped_due_to_unmapped_read_or_mate = 0 # The number of DNA fragments skipped due to the read or its mate being unmapped
n_fragment_skipped_due_to_improper_pair = 0 # The number of DNA fragments skipped due to the read-pair being improperly paired
n_fragment_skipped_due_to_complicated_cigar = 0 # The number of DNA fragments skipped due to the read or read-pair containing a complicated CIGAR
n_methylation_loci_per_read = {} # Dictionary of the number of methylation loci that passed QC per read
chr_map = dict(list(zip(AlignmentFile.references, list(range(len(AlignmentFile.references)))))) # A dictionary mapping the reference names to their order in the AlignmentFile header
methylation_m_tuples = MTuple(args.output_prefix, m, methylation_type, chr_map) # Initiate the MTuple object

# Print key variable names and command line parameter options to STDOUT
print("methtuple (v"+__version__+")\n")
print('Input SAM/BAM file =', AlignmentFile.filename.decode("utf-8"))
print(''.join(['Output file of ', methylation_type, ' ', str(m), '-tuples = ', OUT.name]))
if args.sc:
    print(''.join(["The counts will be collapsed across Watson and Crick strands and the strand of each ", str(m), "-tuple will be recorded as '*'"]))
if not args.nfff:
    print('Reads that fail to pass QC filters will be written to =', FAILED_QC.name, '\n')
else:
    print('There will be no file of reads that fail to pass QC filters\n')

print(''.join(['Assuming quality scores are Phred', str(phred_offset), '\n']))

if args.aligner == 'Bismark':
    print('Assuming SAM/BAM file was created with Bismark version >= 0.8.3.\n')
elif args.aligner == 'Bismark_old':
    print('Assuming SAM/BAM file was created with Bismark version < 0.8.3.\n')

if args.id:
    print('Ignoring reads marked as PCR duplicates')
if not args.uip:
    print('Ignoring improper read-pairs')
else:
    print('Not filtering out read-pairs that are marked as improperly paired. The definition of a proper read-pair is aligner-specific and the value is set with the 0x2 bit in the SAM flag')
if ignore_read_1_pos:
    print("Ignoring positions ", ignore_read_1_pos, "of each read (if data are single-end) or of each read_1 (if data are paired-end).")
else:
    print("Using all positions of each read (if data are single-end) or of each read_1 (if data are paired-end).")
if ignore_read_2_pos:
    print("Ignoring positions ", ignore_read_2_pos, "of each read_2 (if data are paired-end).")
else:
    print("Using all positions of each read_2 (if data are paired-end).")
print('Ignoring methylation calls with base-quality less than', min_qual)
print('Ignoring reads with mapQ less than', min_mapping_quality)

if overlap_filter == 'sequence_strict':
    print('Ignoring paired-end reads that have overlapping mates if the overlapping sequences are not identical.\n')
elif overlap_filter == 'sequence':
    print('Ignoring any overlapping portion of paired-end reads if the sequences in the overlap are not identical.\n')
elif overlap_filter == 'XM_strict':
    print('Ignoring paired-end reads that have overlapping mates if the XM-tags in the overlap are not identical.\n')
elif overlap_filter == 'XM':
    print('Ignoring any overlapping portion of paired-end reads if the XM-tags in the overlap are not identical.\n')
elif overlap_filter == 'XM_ol':
    print('Ignoring any overlapping positions of paired-end reads where the XM-tags disagree and counting once the remaining overlapping positions.\n')
elif overlap_filter == 'quality':
    print('Using all paired-end reads, even those that have overlapping mates. However, for the overlapping sequence, only the mate with the higher quality scores in the overlapping region shall be used.\n')
elif overlap_filter == 'Bismark':
    print("Using all paired-end reads, even those that have overlapping mates. However, for the overlapping sequence, only read_1 shall be used (as done by bismark_methylation_extractor --no_overlap).\n")

if not all_combinations:
  print(''.join(['Creating ', methylation_type, ' ', str(m), '-tuples']))
  if m > 1:
    print(''.join(['WARNING: ', str(m), '-tuples may still have intervening methylation loci (i.e. NIC > 0). Such ', str(m), '-tuples generally occur in paired-end reads with non-overlapping mates but can also be caused by filtering methylation calls by base quality, read-position, etc. You may wish to post-hoc filter ', str(m), '-tuples with NIC > 0.\n']))
else:
  if m > 1:
    print(''.join(['Creating all possible ', str(m), '-tuples, including non-neighbouring ones.']))
  else:
    print(''.join(['Creating ', methylation_type, ' ', str(m), '-tuples']))

# Flag used in loop
verified_tags = False
# Loop over the AlignmentFile and extract methylation m-tuples
for read in AlignmentFile:
    # Check that mapped reads have the XR-, XG- and XM-tags set. Will only check first mapped read, so assumes that the first mapped read in the AlignmentFile is representative of all mapped reads in the AlignmentFile.
    if not verified_tags:
    # Check XR-tag
        if not read.is_unmapped and not read.has_tag('XR'):
            exit_msg = "ERROR: The first mapped read does not contain an XR-tag. Currently, methtuple can only process SAM/BAM files created by Bismark, sorry."
            sys.exit(exit_msg)
        # Check XG-tag
        if not read.is_unmapped and not read.has_tag('XG'):
            exit_msg = "ERROR: The first mapped read does not contain an XG-tag. Currently, methtuple can only process SAM/BAM files created by Bismark, sorry."
            sys.exit(exit_msg)
        # Check XM-tag
        if not read.is_unmapped and not read.has_tag('XM'):
            exit_msg = "ERROR: The first mapped read does not contain an XM-tag. Currently, methtuple can only process SAM/BAM files created by Bismark, sorry."
            sys.exit(exit_msg)
        else:
            print('Verified that the XR-, XG- and XM-tags are set for the first mapped read.\n')
            verified_tags = True

    # Read is first in a read-pair
    if read.is_paired and read.is_read1:
        # Fix QNAME and FLAG values if data were aligned with Bismark version < 0.8.3
        if args.aligner == 'Bismark_old':
            read = fix_old_bismark(read)
        read_1 = read
        continue
    # Read is second in a read-pair
    elif read.is_paired and read.is_read2:
        # Fix QNAME and FLAG values if data were aligned with Bismark version < 0.8.3
        if args.aligner == 'Bismark_old':
            read = fix_old_bismark(read)
        read_2 = read
        n_fragment += 1
        # Skip duplicate reads if command line parameter --ignore-duplicates is set and read is marked as a duplicate
        if args.id and read.is_duplicate:
            failed_read_msg = '\t'.join([read_1.query_name, 'marked as duplicate\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_duplicate += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip read-pairs if either mate's mapQ is less than min_mapping_quality
        if read_1.mapping_quality < min_mapping_quality or read_2.mapping_quality < min_mapping_quality:
            failed_read_msg = '\t'.join([read_1.query_name, 'mapQ < --min-mapq\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_low_mapping_quality += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip read if either mate is unmapped
        if read_1.is_unmapped or read_2.is_unmapped:
            failed_read_msg = '\t'.join([read_1.query_name, 'read or its mate is unmapped\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_unmapped_read_or_mate += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip improperly paired-reads unless --use-improper-pairs flag is set
        if not args.uip and not read.is_proper_pair:
            failed_read_msg = '\t'.join([read_1.query_name, 'read is not mapped in proper pair\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_improper_pair += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip read if mates are mapped to different chromosomes
        if read_1.reference_id != read_2.reference_id:
            failed_read_msg = '\t'.join([read_1.query_name, 'mates map to different chromosomes\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_diff_chr += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip reads containing complicated CIGAR strings, i.e. anything except 'M', 'I', 'D', 'S' or 'H'.
        if does_read_contain_complicated_cigar(read_1) or does_read_contain_complicated_cigar(read_2):
          if does_read_contain_complicated_cigar(read_1):
            failed_read_msg = '\t'.join([read_1.query_name, 'read has complicated CIGAR', read_1.cigartuples, '\n'])
          elif does_read_contain_complicated_cigar(read_2):
            failed_read_msg = '\t'.join([read_2.query_name, 'read has complicated CIGAR', read_2.cigartuples, '\n'])
          FAILED_QC.write(failed_read_msg)
          n_fragment_skipped_due_to_complicated_cigar += 1
          # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
          read_1 = None
          read_2 = None
          continue
        # Check that read_1 and read_2 have identical read-names. If not, exit because the file probably isn't sorted by queryname.
        if read_1.query_name == read_2.query_name:
            methylation_m_tuples, n_methylation_loci_in_fragment, n_fragment_skipped_due_to_bad_overlap = extract_and_update_methylation_index_from_paired_end_reads(read_1, read_2, AlignmentFile, methylation_m_tuples, m, all_combinations, methylation_type, methylation_pattern, ignore_read_1_pos, ignore_read_2_pos, min_qual, phred_offset, ob_strand_offset, overlap_filter, n_fragment_skipped_due_to_bad_overlap, FAILED_QC)
            # Update the n_methylation_loci_per_read dictionary
            if not n_methylation_loci_in_fragment in n_methylation_loci_per_read:
                n_methylation_loci_per_read[n_methylation_loci_in_fragment] = 0
            n_methylation_loci_per_read[n_methylation_loci_in_fragment] += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
        else:
            exit_msg = ''.join(["ERROR: The name of read_1 is not identical to to that of read_2 for read-pair ", read_1.query_name, read_2.query_name, ". This usually means that the paired-end SAM/BAM file needs to be sorted in queryname order with `samtools sort -n` ", AlignmentFile, " (see README.md for further details). If you believe that your paired-end SAM/BAM file is already sorted by queryname and that this error is incorrect, please file an issue at www.github.com/PeteHaitch/methtuple"])
            sys.exit(exit_msg)
    # Read is single-end
    elif not read.is_paired:
        n_fragment += 1
        # Skip duplicates reads if command line parameter --ignore-duplicates is set and read is marked as a duplicate
        if args.id and read.is_duplicate:
            failed_read_msg = '\t'.join([read.query_name, 'marked as duplicate\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_duplicate += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip read if read's mapQ is less than min_mapping_quality
        if read.mapping_quality < min_mapping_quality:
            failed_read_msg = '\t'.join([read.query_name, 'mapQ < --min-mapq\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_low_mapping_quality += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip read if it is unmapped
        if read.is_unmapped:
            failed_read_msg = '\t'.join([read.query_name, 'read is unmapped\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_unmapped_read_or_mate += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        # Skip reads containing complicated CIGAR strings, i.e. anything except 'M', 'I', 'D', 'S' or 'H'.
        if does_read_contain_complicated_cigar(read):
            '\t'.join([read.query_name, 'read has complicated CIGAR', read.cigartuples, '\n'])
            FAILED_QC.write(failed_read_msg)
            n_fragment_skipped_due_to_complicated_cigar += 1
            # Set both read_1 and read_2 as the None object to ensure that old values don't accidentally carry over to when I process the next read-pair
            read_1 = None
            read_2 = None
            continue
        methylation_m_tuples, n_methylation_loci_in_fragment = extract_and_update_methylation_index_from_single_end_read(read, AlignmentFile, methylation_m_tuples, m, all_combinations, methylation_type, methylation_pattern, ignore_read_1_pos, min_qual, phred_offset, ob_strand_offset)
        if not n_methylation_loci_in_fragment in n_methylation_loci_per_read:
            n_methylation_loci_per_read[n_methylation_loci_in_fragment] = 0

        n_methylation_loci_per_read[n_methylation_loci_in_fragment] += 1
    # Read is neither single-end nor a mate from a read-pair. This shouldn't happen.
    else:
        exit_msg = ''.join(['ERROR: Read ', read.query_name, ' is missing the 0x01, 0x40 or 0x80 FLAG bit. This should never happen. Please file an issue at www.github.com/PeteHaitch/methtuple describing the error.'])
        sys.exit(exit_msg)

# Write results to disk
print(''.join(['Finished extracting ',  methylation_type, ' ', str(m), '-tuples.']))
print('Now writing output to', OUT.name, '...\n')
write_methylation_m_tuples_to_file(methylation_m_tuples, OUT)

# Print some summary information to STDOUT
print('Summary of the number of DNA fragments processed by methtuple')
print(''.join(['Number of DNA fragments in file = ', str(n_fragment)]))
print(''.join(['Number of DNA fragments skipped due to being marked as PCR duplicates = ', str(n_fragment_skipped_due_to_duplicate)]))
print(''.join(['Number of DNA fragments skipped due to failing the --min-mapq filter = ', str(n_fragment_skipped_due_to_low_mapping_quality)]))
print(''.join(['Number of DNA fragments skipped due to the read or its mate being unmapped = ' , str(n_fragment_skipped_due_to_unmapped_read_or_mate)]))
print(''.join(['Number of DNA fragments skipped due to the read-pair being improperly paired = ', str(n_fragment_skipped_due_to_improper_pair)]))
print(''.join(['Number of DNA fragments skipped due to mates mapping to different chromosomes = ', str(n_fragment_skipped_due_to_diff_chr)]))
print(''.join(['Number of DNA fragments skipped due to a complicated CIGAR in the read or its mate = ', str(n_fragment_skipped_due_to_complicated_cigar)]))
if overlap_filter == "sequence_strict" or overlap_filter == "XM_strict":
  print(''.join(['Number of DNA fragments skipped due to failing the --overlap-filter ', overlap_filter, ' filter = ', str(n_fragment_skipped_due_to_bad_overlap)]))
n_informative_fragments = 0
for k, v in n_methylation_loci_per_read.items():
    if k >= m:
        n_informative_fragments += v
print(''.join(['Number of DNA fragments informative for ', methylation_type, ' ', str(m), '-tuples = ', str(n_informative_fragments), ' (', str(round(n_informative_fragments / float(n_fragment) * 100, 1)), '% of total fragments)\n']))

# Write histogram to HIST with the number of methylation loci per DNA fragment that passed QC filters
print('Writing histogram with the number of', methylation_type, 'methylation loci per DNA fragment that passed QC filters to', HIST.name, '...')
HIST.write('n\tcount\n')
for k, v in iter(sorted(n_methylation_loci_per_read.items())):
    HIST.write(''.join([str(k), '\t', str(v), '\n']))

# Close all files
AlignmentFile.close()
OUT.close()
HIST.close()
FAILED_QC.close()
