nemo_curator/__init__.py,sha256=4NZHY4ji7hF4So0yI3YgZv1hPIsrr5DjtvdW2xr6GRw,1726
nemo_curator/_compat.py,sha256=KD0x4kI45L0vTzlt2cV-kmRk0zA-jDQamjwvWurVLJg,967
nemo_curator/log.py,sha256=RoRApb4jGYC2TaIWZy8SnYi4v7CO6r31FYBCO15jCss,2937
nemo_curator/sample_dataframe.py,sha256=553_PeN7UlAjbcIZNDeZxmFmUJgqDB_0O86ML4SwUWA,2835
nemo_curator/datasets/__init__.py,sha256=jb_GcqbWfrisj9nqglR0faq0iaMNh0fhKJ-ZTw71_7k,683
nemo_curator/datasets/doc_dataset.py,sha256=KZ5XzeBGc99wFlWhCftUbf6CK7l5EkfqGCKSawuCiS8,7545
nemo_curator/download/__init__.py,sha256=9lBlOthqw-Qd1P93Hn1VmjAfM57PStMfQ8teCf-V-Sk,1887
nemo_curator/download/arxiv.py,sha256=3KaaccswHti-ZCQ2ev1FxCB0jf58xTCY70BX_L3Sw5g,16228
nemo_curator/download/commoncrawl.py,sha256=r6HIH0XQ9IZssSGoKXCTV9Llef0T6gFL_H3y-5WGDtc,15562
nemo_curator/download/doc_builder.py,sha256=WRYD-vq9lDwhuOyv1o17oqcYkMafFfhhKTBs4HdrkDs,7372
nemo_curator/download/wikipedia.py,sha256=nVSbpmW7QhNg9eGJAA4BOAHLptJuhctllEFaRDfZWLg,31193
nemo_curator/filters/__init__.py,sha256=2oFTNdkjy5pJboO23HpqhiDdjf2gTHcq-jIsbmeDZM4,2525
nemo_curator/filters/classifier_filter.py,sha256=La51m4_HfUTW4vtXku6AtirAzAuqdsxpkedsqpXGkWE,3436
nemo_curator/filters/code.py,sha256=Fx-8hHHL85VxsgcKZ2eX7Cai-5TCqgLOzfnH-_Hc7wE,10712
nemo_curator/filters/doc_filter.py,sha256=9T2yC0F1-g7B9m73_BpPVrqZXH3oSVePQdcoWC5Uak4,2030
nemo_curator/filters/heuristic_filter.py,sha256=bBBxfVALXBdZGmUGCNjPl_xROSybYNzL-Ct7nLHHmd8,19992
nemo_curator/modifiers/__init__.py,sha256=S_fHYgbQOezDHVA65kV-AR2HMzPn9-G482Rc-_vqNN0,976
nemo_curator/modifiers/c4.py,sha256=U25SZoeVOcbMRJ4kRsb8JsI8S6IQv1bBt--Q3MfdqLk,3273
nemo_curator/modifiers/doc_modifier.py,sha256=i6lDpiCybvQP_u6ea2-dC1X8Ym2CTRImPmtQBlxA5HU,936
nemo_curator/modifiers/fasttext.py,sha256=aUg7jTcS1OvG_BKG1PUexzc0JcxEF4jDgeGoy0yx8XI,938
nemo_curator/modifiers/pii_modifier.py,sha256=zZbsc3BkZjSuw-x3VMYUci7CSmdQ2YPjIC_z-fMWeck,3728
nemo_curator/modifiers/unicode_reformatter.py,sha256=nz-vfzMqcOHpoCY6KSkwQ1nLmhA1UxOBGb0B7wgeYHE,846
nemo_curator/modules/__init__.py,sha256=anecAKbLHiZ7jpxRhFOw2Dm5jIXlMyg-TLrcyE84JnQ,2690
nemo_curator/modules/add_id.py,sha256=3D5_xKM4LwCqPieAD8dE2h65GuvOG2z55MBs-a6W_Pw,3341
nemo_curator/modules/config.py,sha256=T4TRqE7JvUSagy5iVgFTpLgX2XEgXNm2S7JWKc1iQig,6546
nemo_curator/modules/dataset_ops.py,sha256=IMUPEE2kCQZduqPyozQwQvLSqA93JULjoOVC7XkMAro,7170
nemo_curator/modules/distributed_data_classifier.py,sha256=FFoniNNy3reqCq4LhobAtVDbHJuTOksPb_TVSCP8Sqs,11799
nemo_curator/modules/exact_dedup.py,sha256=JpDbBsdlvgZ09iPfwjpUFhhXqNeLVw4K0uSAWZJYdLM,6406
nemo_curator/modules/filter.py,sha256=3cp1XK8aN-g7bGXIRI4dWyLU3IvcCwOnp7YyHL7Os6M,4529
nemo_curator/modules/fuzzy_dedup.py,sha256=LBnqP-8HO3l3hQ6suBgurucXIbezh_FBP4qe8e36yYE,57083
nemo_curator/modules/meta.py,sha256=9q-4cd1pZQm4T5D-U_oydoBtavJDxR_ssAZy32KS0pI,825
nemo_curator/modules/modify.py,sha256=-vuGZy_hhIN963Eq3aIhhCVbHOYOV8nle-Ik8VGS8CQ,1406
nemo_curator/modules/semantic_dedup.py,sha256=ope8iYUR9D3ECVPxsDgv3otNfl9AP8mznht8nGXVfqQ,22148
nemo_curator/modules/task.py,sha256=4xpnykbH9qQiyODjwVMlR8GlT6xCEAgk9tWBtPv3OCo,18766
nemo_curator/nemo_run/__init__.py,sha256=SsipNxmMuUKT-P22M7z3i2K10gSybbAuHklWQgxl9Zs,675
nemo_curator/nemo_run/slurm.py,sha256=WySsj9PYVhnMJZAt_4kOqPLyRsnr0JYnizSAL6d4oms,5298
nemo_curator/pii/__init__.py,sha256=NVzEmeQhr33NGGaBYs_GTg-TgkVqEAgU8wYE_BWXMLQ,610
nemo_curator/pii/algorithm.py,sha256=vQyyewAniVQ0xpCPm61IdjQwfZWm9MLcs7etuT96ncI,9843
nemo_curator/pii/constants.py,sha256=FhrU0UmUUHE6LIxDNAP4WQuY3Dl48d0bdBwr5sM-TVg,369
nemo_curator/pii/custom_batch_analyzer_engine.py,sha256=bc5EmJ5BZs07C6Y2Z-rlX6FagC1R29hjxDyvisrbsFY,6668
nemo_curator/pii/custom_nlp_engine.py,sha256=1ZPR8gxCskO7pajgSFgeOz_eAERJjNV12V1Oqqz9rJ8,2686
nemo_curator/pii/recognizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nemo_curator/pii/recognizers/address_recognizer.py,sha256=Pc_lSuT-J5asHQQ9So5GW-2yoSDoIqoQzSsjtkyg9uY,1828
nemo_curator/scripts/__init__.py,sha256=NVzEmeQhr33NGGaBYs_GTg-TgkVqEAgU8wYE_BWXMLQ,610
nemo_curator/scripts/add_id.py,sha256=vVXRVjEq54-o4eCKbIMnOAvAj8oieGDWLKYvOZpStT8,3969
nemo_curator/scripts/blend_datasets.py,sha256=PYdps-cilBzHPkIMV5HYZCqVpJ2NYwEhoiOttCkV89w,3715
nemo_curator/scripts/domain_classifier_inference.py,sha256=dneASqLn-ghTo7D2tfWueZTbItgkinyv5dqhIQASfW0,4073
nemo_curator/scripts/download_and_extract.py,sha256=8vCS5wL9cvpYG_jzucAEBUL0XY77ru7qG64WvgBkDD0,5814
nemo_curator/scripts/filter_documents.py,sha256=r4rD8pl4ySO7XHFAbgfGb2r0ivv-OB0HdnGJPbaUv1c,10984
nemo_curator/scripts/find_exact_duplicates.py,sha256=78XGrNJB3TaEvBhH169HyyiRI6VIQ4b_NbY1MIicKXQ,4000
nemo_curator/scripts/find_matching_ngrams.py,sha256=jPpnh_vHFeG7sPXwUNw2WL2oyBugaKJ7wBzuZZaxTi8,3335
nemo_curator/scripts/find_pii_and_deidentify.py,sha256=YzA7kog983oJanNaKWeERcDlPAaZO039QY2VzCU0PUM,5536
nemo_curator/scripts/get_common_crawl_urls.py,sha256=jZVTY2GJXipO-WqiTYP830AzuecNtjvRnsAEKesZV0g,3479
nemo_curator/scripts/get_wikipedia_urls.py,sha256=VcBW9tcmVxRHx7ijWFPf_2kmWF6LLkdjZFhLSHA_zwI,1862
nemo_curator/scripts/make_data_shards.py,sha256=QFykRSrrur3KjziOnxkbDX9pppJ06qyi0riLQiqKFmw,2589
nemo_curator/scripts/prepare_fasttext_training_data.py,sha256=-H7gHQnojdRc1EdxCt2NKJRn2giNi533pMujOli8lhA,3521
nemo_curator/scripts/prepare_task_data.py,sha256=9RzN261e1x4b3L7tORJF8maAJh2ZW4hXJJjQkZNQMjs,3034
nemo_curator/scripts/quality_classifier_inference.py,sha256=HxhoPlAUJGa64a7-8KMKD15eMd44WUhDOw5cp1EtKgA,3887
nemo_curator/scripts/remove_matching_ngrams.py,sha256=GEXccxDhUxy2vvjkqO3IF5Q8wW99g4Kn9TqyzKJKwtE,5306
nemo_curator/scripts/separate_by_metadata.py,sha256=1e9NAYOkgidEXoLXb5u7DOyrS2ayExDEHaHtADbuzKc,3732
nemo_curator/scripts/text_cleaning.py,sha256=eha9Gztu3wx3E0QJgRB_2XMoaFbdu8sOJOuQOLeJpDQ,2980
nemo_curator/scripts/train_fasttext.py,sha256=bTPRQVxx2xQ37NH_wci6Kw_JtZvIX5JrZemV7Qbqzmg,6420
nemo_curator/scripts/verify_classification_results.py,sha256=dQ1PfqDnOuh_Hrb8XXqkiLL7KWJH3ghVU3PtiWaii-I,6615
nemo_curator/scripts/fuzzy_deduplication/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py,sha256=jpCFxnmNJ68x9-n4v991EXNPLfBEfjtty6-ERP6Sj4c,5340
nemo_curator/scripts/fuzzy_deduplication/connected_components.py,sha256=f1V605Cp-Py5JBz8yQTLQWp8F_u4H20lTFvQZqyWjF0,2769
nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py,sha256=9nhJ3JBrZQ96mIlvYlS_TNYLz2kTIam98hh3ziBpZuQ,2779
nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py,sha256=3_c0O7YceShfY_kAsFAKctaW3TNbecKYjXZPhIxRpJE,4107
nemo_curator/scripts/fuzzy_deduplication/map_buckets.py,sha256=vxRnUfaQyFk42y9fjP8dgoAMqN9JHuL5Y5ZAuv2TEHs,5716
nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py,sha256=HthsBEILImY_Gm4J1c_5RhgXlsC_RQb2MZ4egmn2Xmg,3875
nemo_curator/scripts/semdedup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nemo_curator/scripts/semdedup/clustering.py,sha256=Lo3puipI8XBbZy2AYtqsJD5QrCsw8udUApK9Dby-yEc,3857
nemo_curator/scripts/semdedup/compute_embeddings.py,sha256=kPz-7TmA-02sTe5UJkWgo7TMmP9BAZlCoBrEuk5536k,5084
nemo_curator/scripts/semdedup/extract_dedup_data.py,sha256=WKR8xnd036b8qPys07UI-8NRIBBmQV0nu3jp3Gn9XdY,3288
nemo_curator/services/__init__.py,sha256=B3D0AylSmuOOruKFvG_vqV78f9OuR52mDGTWa4npDoQ,973
nemo_curator/services/conversation_formatter.py,sha256=JRMOpMCMrCz2nN4BwHvcHigi_xJTrw6lo6MTlzGN2kQ,1022
nemo_curator/services/model_client.py,sha256=1OEgNbF9vgeso9d64G4cKiGL9ZfxU5Csue0hOOHwi_0,2994
nemo_curator/services/nemo_client.py,sha256=uVN8xhRVsKr7iD20xJ8j-ZfHWzlsOUSnIkvhf1xF0v4,3626
nemo_curator/services/openai_client.py,sha256=z4Kqvs1ItyvhnbPJJB-Idpvys6iNUyr5ai_IayjwQHw,6035
nemo_curator/synthetic/__init__.py,sha256=EskbOiR6d4VCMNgR50IIo-mgjc4SDnaQ2ru-4x7iw3E,3072
nemo_curator/synthetic/async_nemotron.py,sha256=Meg3F9-gTt3q0_n59dQoVMIJVsbhE2xdTDSe7sflroo,80632
nemo_curator/synthetic/error.py,sha256=EWGJNnhanzilxsUrlMKOgcv2fJ_9YjunOFIpLpteOzQ,803
nemo_curator/synthetic/mixtral.py,sha256=CUKR-bmD3qYayxtK08987A7AepAa5jbzvGfoltxzSZ4,1319
nemo_curator/synthetic/nemotron.py,sha256=m1S03olbsjYmNcv9XF5P2xUF5AtF_wk0IshAGtIgMzk,71704
nemo_curator/synthetic/no_format.py,sha256=TjJoa04f_VQTczgAYf67p5XnGB0oKATXFRQ6SXSYTAk,1194
nemo_curator/synthetic/prompts.py,sha256=DjsQRoi4iqQX9FNeWFEVoKC0TAKGFurCqoEt-Gxgwyg,8198
nemo_curator/tasks/__init__.py,sha256=IfaRyarhNVd7icbFyiT_fBnWoY38iCcIqZoFR019Wis,1499
nemo_curator/tasks/downstream_task.py,sha256=krVCvrbuZy9ToNyImYXpxUl03hBCnPVlWcrOXWvCwPY,1968
nemo_curator/tasks/metrics.py,sha256=MgBEr9mC9KEeDRfmW32_k4a06f6ljWveyC9LBLtAG0Y,18930
nemo_curator/utils/__init__.py,sha256=NVzEmeQhr33NGGaBYs_GTg-TgkVqEAgU8wYE_BWXMLQ,610
nemo_curator/utils/config_utils.py,sha256=0_Uy16dEXqqTeqiQsBCRzD8E_wzMYOYvz0_jkC-ZGhU,3598
nemo_curator/utils/constants.py,sha256=Szr4RTF8hFFkTqz5dhh_IbTReZfBXFLlhyokggQ_bvI,3296
nemo_curator/utils/decorators.py,sha256=t0gGx4HhyEJntCk5XAAaoePmoahSyTNjhqtEpGeeHrY,864
nemo_curator/utils/distributed_utils.py,sha256=hORyJQ2CMfLrwIs3NHLTiFeacs2HgfSfSJSWvzhfZhs,21557
nemo_curator/utils/download_utils.py,sha256=i97IsKFyOCWrZa6JuZh84ugabIO9LHkKxIp7fzAR9ic,7458
nemo_curator/utils/file_utils.py,sha256=vcaNOC39I-3cv5sPiwIQkDiSblWND9x1hbi3pp8jr9g,8541
nemo_curator/utils/gpu_utils.py,sha256=3hZWSm_aQdxyIyOjf2vsuHo3QT9qzKNh63ZPzvuTohE,1113
nemo_curator/utils/import_utils.py,sha256=9-nRqRrjYAfdi68avUB0QHuU5VZ5KPvc2J2Ws0D-jus,11869
nemo_curator/utils/module_utils.py,sha256=aaTOspYbMLpGby-gd-wcVKhVv_eBbtYpPd4mjMiRgSM,779
nemo_curator/utils/script_utils.py,sha256=nHSwTBH55SO9MTcftGDjOYMTPririXwcvzzxNCQeOaU,18123
nemo_curator/utils/semdedup_utils.py,sha256=aWnn5ZX9Nkk4G1f9nIHKjRgjdAPxGZ_QOfkvGtxq7ZY,15323
nemo_curator/utils/text_utils.py,sha256=OMFateEXEDtlsIrZov7J_6CXYI00M0AAfXs_m-wxiM0,5920
nemo_curator/utils/fuzzy_dedup_utils/__init__.py,sha256=NVzEmeQhr33NGGaBYs_GTg-TgkVqEAgU8wYE_BWXMLQ,610
nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py,sha256=05MwMpEDBycgB9rESP5yIZSbZDqAW4tb3t2zJLKSEtM,1862
nemo_curator/utils/fuzzy_dedup_utils/io_utils.py,sha256=SZddQUZWUI47b_oRNfhy3myBH1zEHitJ5JaU6qetNbc,6292
nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py,sha256=PQpt26iyJQhBiBdjA1n3fVMM-HYxJrU1YeTGucInxUA,7800
nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py,sha256=XzhhT-dP885ZQW5Y37jCvMz4D0DKS_Lkg7V3UqQaoEg,2588
nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py,sha256=F-FIwICEVM6Lmd90oNw20XaQCbdWXGzQAISQ0DUZsXc,5219
tests/__init__.py,sha256=Gl7X71vLknbUhnkfFuOWJvs_by-CkQjqERM3FQp24Uo,1253
tests/test_add_id.py,sha256=54wB9j5C79YGEuoMUBNW4e4gHHAwXNLJr4fET1-IpDk,4987
tests/test_blend_datasets.py,sha256=X4Xu9jTroU0dbu4WFkvHeVqePxj_ncASJZfLpIulyWA,3878
tests/test_config.py,sha256=5sx7G-ATi0JJ35UeLWWlOLIFHibuvpb3trpMC40GV5o,2716
tests/test_dataset.py,sha256=eD01HC-wwoCqLP4zGNn1N__nv-NRqIAd7ey9HdBGD0I,858
tests/test_download.py,sha256=Yirg1czyXKAahBSyEeoQ_IPR23EzQfq_KzmX8wr_c_w,6426
tests/test_exact_dedup.py,sha256=hBLdCU7qBUxaePygjtT4QhsxK5Pcyf8x5b6mIBr3U_Y,1900
tests/test_filters.py,sha256=R-CTSCsFx8ZZlFLbGcv9K_wjT2ilgBfVwk_cnq1R2og,30378
tests/test_fuzzy_dedup.py,sha256=G8uq_6xfjFt4BebhuEhGPqw7YsMtoHMp1bsmiIhyRBs,16424
tests/test_io.py,sha256=szxF9ODvj390Hc800CQpW1MUhOMV7sHU328vog8a5W0,7819
tests/test_pii_accuracy.py,sha256=vsjKth8M73To0UkMJvhR0WpE6jsZLMZH4zsyXrHa9y0,6376
tests/test_semdedup.py,sha256=mzX_95_xBa6TfpXGsbnEJMoQokehjA_nCi0TD-HwB6Y,2800
tests/test_seperate_by_metadata.py,sha256=FGXzOXXDys7zXzyXkY2JuwMlV0Jp8fs26gOzi6IcfuA,2566
tests/test_shuffle.py,sha256=JaX3Psby4bEIe0-BNOCDvnrHP9rnXk2EbC-MY5x6hQo,7389
tests/test_task_decontamination.py,sha256=RFj0x7rkl_pQEGf3RI4KVB4T4gADjX4Xf3FOBHJnCyA,12881
tests/test_unicode_reformatter.py,sha256=iALTzl-G1ggy9ZjiDWqhw9ub1x3DqZCQro4S1UVR0YA,2082
nemo_curator-0.4.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
nemo_curator-0.4.1.dist-info/METADATA,sha256=QZk-PoGOPZeSi8P3Pug1fMspY6qiddZgk2Glprd2i8A,14192
nemo_curator-0.4.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
nemo_curator-0.4.1.dist-info/entry_points.txt,sha256=H-9bbcmtomXmT-SDGyevoyaCYrRPCGiTyZCeobb2BnY,2410
nemo_curator-0.4.1.dist-info/top_level.txt,sha256=NZpMgId9Qc8gKXBh5ITLq99W4VsNI3MN1cG7f0hm_n0,19
nemo_curator-0.4.1.dist-info/RECORD,,
