#!/usr/bin/python3
#
# Main program for extracting a dictionary from wiktionary.  This has
# mostly been used with enwiktionary, but should be usable with other
# wiktionaries as well.
#
# Copyright (c) 2018 Tatu Ylonen.  See LICENSE.

import os
import re
import sys
import json
import hashlib
import argparse
import wiktextract
from wiktextract import wiktlangs


# Namespace prefixes whose pages are skipped entirely: capture_page()
# returns before saving or analyzing them.
ignore_prefixes = {
    "Index",
    "Help",
    "MediaWiki",
    "Citations",
    "Reconstruction",
    "Concordance",
    "Rhymes",
    "Thread",
    "Summary",
    "File",
    "Transwiki",
}

# Namespace prefixes that are expected and captured.  A prefixed page whose
# prefix is in neither set is reported with an "UNRECOGNIZED PREFIX" message.
recognized_prefixes = {
    "Category",
    "Appendix",
    "Wiktionary",
    "Thesaurus",
    "Module",
    "Template",
}


def capture_page(title, text, pages_dir):
    """Checks if the page needs special handling (and maybe saving).

    A title of the form ``Prefix:rest`` is treated as a special page:
    pages whose prefix is in ``ignore_prefixes`` are dropped immediately,
    and a prefix missing from ``recognized_prefixes`` is reported on
    stdout but the page is still saved.  Any other title is treated as a
    dictionary word and filed under ``Words/<first two characters>/``.

    If ``pages_dir`` is not None, the page text is written as UTF-8 to a
    ``.txt`` file below that directory, with the file name derived from
    the (sanitized) title.

    Returns True if the page should be processed normally as a
    dictionary entry, and False otherwise."""
    assert isinstance(title, str)
    assert isinstance(text, str)
    assert pages_dir is None or isinstance(pages_dir, str)
    analyze = False
    m = re.match(r"^([A-Z][a-z][-a-zA-Z0-9_]+):(.+)$", title)
    if not m:
        # Ordinary word.  Very long titles are truncated and made unique
        # with a short hash so that the file name stays reasonably short.
        if len(title) > 100:
            h = hashlib.sha256()
            h.update(title.encode("utf-8"))
            title = title[:100] + "-" + h.hexdigest()[:10]
        title = "Words:" + title[:2] + "/" + title
        analyze = True
    else:
        prefix = m.group(1)
        if prefix in ignore_prefixes:
            return False
        if prefix not in recognized_prefixes:
            print("UNRECOGNIZED PREFIX", title)
        if prefix == "Category":
            # Split the category name after its first word so that it
            # becomes a separate directory level when saved.
            m = re.match(r"^(Category:[^_ :]+)[_ :]*(.*)", title)
            if m:
                title = m.group(1) + ":" + m.group(2)

    if pages_dir is not None:
        # Build the part of the path that comes from the title.  Characters
        # that are not safe in file names become "_" and ":" becomes a
        # directory separator.
        rel = re.sub(r"[^-\w_.:/]", "_", title)
        rel = re.sub(r":", "/", rel)
        rel = "/" + rel + ".txt"
        # Strip leading dots from every component (no hidden files and no
        # ".." escapes) and collapse repeated slashes.  This is applied
        # only to the title-derived part so that a pages_dir containing
        # dot-directories (e.g. "/home/u/.cache/pages") is left intact.
        rel = re.sub(r"/\.+", "/", rel)
        rel = re.sub(r"//+", "/", rel)
        path = pages_dir + rel
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Wiki text is full of non-ASCII characters; do not depend on the
        # locale's default encoding.
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

    return analyze


if __name__ == "__main__":
    # Command-line entry point: parse options, run the extractor over the
    # dump file, write one JSON object per line for each captured word, and
    # optionally print statistics.
    parser = argparse.ArgumentParser(
        description="Multilingual Wiktionary data extractor")
    parser.add_argument("path", type=str,
                        help="Input file (.../enwiktionary-<date>-"
                        "pages-articles.xml[.bz2])")
    parser.add_argument("--out", type=str, default=None,
                        help="Path where to write output (- for stdout)")
    parser.add_argument("--translations", action="store_true", default=False,
                        help="Capture translations")
    parser.add_argument("--language", type=str, action="append", default=[],
                        help="Language to capture (can specify multiple times, "
                        "defaults to English and Translingual)")
    parser.add_argument("--list-languages", action="store_true", default=False,
                        help="Print list of supported languages")
    parser.add_argument("--pages-dir", type=str, default=None,
                        help="Directory under which to store all pages")
    parser.add_argument("--pronunciation", action="store_true", default=False,
                        help="Capture pronunciation information")
    parser.add_argument("--statistics", action="store_true", default=False,
                        help="Print statistics")
    args = parser.parse_args()

    # If --list-languages has been specified, just print the list of supported
    # languages
    if args.list_languages:
        print("Supported languages:")
        for lang in wiktlangs.languages:
            print("    {}".format(lang))
        sys.exit(0)

    # Default to English and Translingual if language not specified.
    if not args.language:
        args.language = ["English", "Translingual"]
    else:
        for x in args.language:
            if x not in wiktlangs.languages:
                print("Invalid language:", x)
                sys.exit(1)

    # Open output file.  When writing to a real file, the data first goes
    # to "<out>.tmp" and is moved into place only after a successful run,
    # so that a failed run does not clobber an existing output file.
    out_path = args.out
    if out_path and out_path != "-":
        out_tmp_path = out_path + ".tmp"
        out_f = open(out_tmp_path, "w", buffering=1024*1024)
    else:
        out_tmp_path = out_path
        out_f = sys.stdout

    word_count = 0

    def word_cb(data):
        """Writes one captured word to the output as a single JSON line."""
        global word_count
        word_count += 1
        out_f.write(json.dumps(data))
        out_f.write("\n")
        # When writing to stdout, flush after every word so that consumers
        # see the data as soon as it is produced.
        if not out_path or out_path == "-":
            out_f.flush()

    def capture_cb(title, text):
        """Passes each page to capture_page() with the --pages-dir value."""
        return capture_page(title, text, args.pages_dir)

    try:
        ctx = wiktextract.parse_wiktionary(
            args.path,
            word_cb,
            capture_cb,
            capture_languages=args.language,
            capture_pronunciation=args.pronunciation,
            capture_translations=args.translations)
    finally:
        # Close the output file even on failure (never close sys.stdout).
        if out_path and out_path != "-":
            out_f.close()

    if out_path != out_tmp_path:
        # Move the finished temporary file over the final path in a single
        # step.  os.replace() overwrites an existing file, so there is no
        # window in which the output file is missing.
        os.replace(out_tmp_path, out_path)

    if args.statistics:
        print("")
        print("LANGUAGE COUNTS")
        # Languages in descending order of count; the listing stops after
        # the first language with fewer than 1000 entries.
        for k, cnt in sorted(ctx.language_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
            if cnt < 1000:
                break
        print("  ...")
        print("")

        print("")
        print("POS HEADER USAGE")
        for k, cnt in sorted(ctx.pos_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))

        print("")
        print("POS SUBSECTION HEADER USAGE")
        for k, cnt in sorted(ctx.section_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))

        print("")
        print("{} WORDS CAPTURED".format(word_count))
