.gitignore
.pylintrc
LICENSE
README.md
pyproject.toml
requirements.txt
setup.cfg
setup.py
.github/workflows/ci.yml
bin/opusfilter
bin/opusfilter-cmd
bin/opusfilter-diagram
bin/opusfilter-duplicates
bin/opusfilter-scores
bin/opusfilter-test
docs/CHANGELOG.md
docs/CONTRIBUTING.md
docs/command_line_tools.md
docs/conf.py
docs/index.rst
docs/installation.md
docs/references.bib
docs/references.rst
docs/requirements.txt
docs/usage.md
docs/filters/alignment_model_filters.md
docs/filters/custom_filters.md
docs/filters/language_model_filters.md
docs/filters/length_filters.md
docs/filters/script_and_language_identification_filters.md
docs/filters/sentence_embedding_filters.md
docs/filters/special_character_and_similarity_filters.md
docs/functions/downloading_and_selecting_data.md
docs/functions/filtering_and_scoring.md
docs/functions/preprocessing_text.md
docs/functions/training_and_using_classifiers.md
docs/functions/training_language_and_alignment_models.md
docs/functions/using_score_files.md
docs/preprocessors/bpe_segmentation.md
docs/preprocessors/custom_preprocessors.md
docs/preprocessors/detokenizer.md
docs/preprocessors/monolingual_sentence_splitter.md
docs/preprocessors/morfessor_segmentation.md
docs/preprocessors/reg_exp_sub.md
docs/preprocessors/tokenizer.md
docs/preprocessors/whitespaceNormalizer.md
example_configs/README.md
example_configs/qed_lm_langid.yaml
example_configs/paracrawl_fi-en/create_ce_sets.yaml
example_configs/paracrawl_fi-en/create_domain_sets.yaml
example_configs/paracrawl_fi-en/create_roc_auc_sets.yaml
example_configs/paracrawl_fi-en/devset_100_1_labels.jsonl
example_configs/paracrawl_fi-en/devset_100_2_labels.jsonl
example_configs/paracrawl_fi-en/prepare_data.yaml
opusfilter/__init__.py
opusfilter/classifier.py
opusfilter/embeddings.py
opusfilter/filters.py
opusfilter/lm.py
opusfilter/opusfilter.py
opusfilter/pipeline.py
opusfilter/preprocessors.py
opusfilter/segment_hash.py
opusfilter/subwords.py
opusfilter/tokenization.py
opusfilter/util.py
opusfilter/word_alignment.py
opusfilter.egg-info/PKG-INFO
opusfilter.egg-info/SOURCES.txt
opusfilter.egg-info/dependency_links.txt
opusfilter.egg-info/requires.txt
opusfilter.egg-info/top_level.txt
tests/test_classifier.py
tests/test_embeddings.py
tests/test_filter_pipeline.py
tests/test_filters.py
tests/test_lm_filter.py
tests/test_opusfilter.py
tests/test_preprocessors.py
tests/test_segment_hash.py
tests/test_subwords.py
tests/test_tokenization.py
tests/test_util.py
tests/test_wordalign_filter.py
tests/test_yaml.py