core

This module contains all the core functions used in the library.

source

Pipeline

 Pipeline (datasources)

A pipeline is a collection of datasources and their associated transformations to be run.

Details
datasources The datasources to be run
show_doc(Pipeline.run)

source

Pipeline.run

 Pipeline.run (global_filters=[], global_cleaners=[],
               cleaning_first=False, globals_first=False, dry_run=False,
               num_proc=2)

Run the pipeline.

Type Default Details
global_filters list [] Filters to be run at the dataset level rather than the example level
global_cleaners list [] Cleaners to be run at the dataset level rather than the example level
cleaning_first bool False Whether to run the cleaning transformations first
globals_first bool False Whether to run the global transformations first
dry_run bool False Whether to skip the actual filtering/cleaning and instead only calculate each filter's criteria, adding them to the dataset as extra columns
num_proc int 2 Number of processes to use
# Dry-run smoke test: with dry_run=True the pipeline must not drop any rows;
# the global minhash_dedup filter only annotates the dataset with extra columns.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],  # column(s) the transformations operate on
        "filters": [],   # no per-example filters for this test
        "cleaners": [],  # no per-example cleaners for this test
    },
    # ...
]

pipeline = Pipeline(datasources)
pipeline.run(dry_run=True, global_filters=[minhash_dedup])

# In dry-run mode the dataset length is unchanged...
assert len(ds) == len(pipeline.datasources[0]["dataset"])
# ...but minhash_dedup has added its bookkeeping columns.
assert "duplicate" in pipeline.datasources[0]["dataset"].features
assert "meta_data" in pipeline.datasources[0]["dataset"].features
assert "__id__" in pipeline.datasources[0]["dataset"].features
[12/02/22 05:21:47] INFO     Original dataset size: 18014                                           2007760668.py:3
                    INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running global filter: minhash_dedup                                  4230721344.py:94
                                                                                                                                                                                                                                                                                                                                                                                                
pip install -e ".[dev]"
# Test the ability to skip global filters. Both datasources here leave
# "skip_global" as False, so both participate in minhash_dedup and both
# are expected to shrink after the (non-dry) run.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
ds_1 = load_dataset("lcama/elon-tweets", split="train")

datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],
        "filters": [],
        "cleaners": [],
        "skip_global": False,  # participate in global filters
    },
    {
        "dataset": ds_1,
        "name": "elon",
        "columns": ["text"],
        "filters": [],
        "cleaners": [],
        "skip_global": False,  # participate in global filters
    },
    # ...
]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=[minhash_dedup])
logger.info(f"Final dataset size: {len(pipeline.datasources[0]['dataset'])}")

# Real (non-dry) run: both datasets should have lost some rows to dedup.
assert len(ds) > len(pipeline.datasources[0]["dataset"])
assert len(ds_1) > len(pipeline.datasources[1]["dataset"])
Downloading and preparing dataset None/None (download: 133.54 KiB, generated: 201.20 KiB, post-processed: Unknown size, total: 334.74 KiB) to /home/nathan/.cache/huggingface/datasets/lcama___parquet/lcama--elon-tweets-dcde4a32936ef7f2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...
Dataset parquet downloaded and prepared to /home/nathan/.cache/huggingface/datasets/lcama___parquet/lcama--elon-tweets-dcde4a32936ef7f2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.
                                                                                                                                                                                                                                                                                                                                                                                                                                                                
[12/02/22 05:23:00] INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running datasource: elon                                              4230721344.py:43
                    INFO     Running global filter: minhash_dedup                                  4230721344.py:94
[12/02/22 05:23:54] INFO     Final dataset size: 10560                                             2684403668.py:26
len(ds), len(ds_1), len(pipeline.datasources[0]["dataset"]), len(pipeline.datasources[1]["dataset"])
(18014, 1601, 10560, 1597)
def formatted(text):
    """
    Embed a tweet's text in a fixed attribution template.

    NOTE(review): the previous docstring described this as formatting a
    "question and answer pair"; the body only wraps *text* in a
    tweet-attribution prompt, so the docstring has been corrected.

    The returned string deliberately keeps the template's leading newline
    and 4-space indentation exactly as written — downstream cells feed the
    output to the dedup pipeline, so the literal template is unchanged.
    """
    formatted_example = f"""
    This tweet was tweeted by Elon Musk.
    Here it is: {text}
    """
    return formatted_example

# Wrap every tweet in the shared formatted() template so all rows carry a
# large common prefix/suffix, then check minhash_dedup still removes rows
# from the templated dataset (rather than treating everything as identical).
fake_qa = ds_1.map(
    lambda x: {"text": formatted(x["text"])}
)
datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],
        "filters": [],
        "cleaners": [],
        "skip_global": False,  # participate in global filters
    },
    {
        "dataset": fake_qa,
        "name": "elon_qa",
        "columns": ["text"],
        "filters": [],
        "cleaners": [],
        "skip_global": False,  # participate in global filters
    },
    # ...
]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=[minhash_dedup])
logger.info(f"Final dataset size: {len(pipeline.datasources[0]['dataset'])}")

# Both datasets should shrink after a real deduplication run.
assert len(ds) > len(pipeline.datasources[0]["dataset"])
assert len(fake_qa) > len(pipeline.datasources[1]["dataset"])
[12/02/22 05:24:00] INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running datasource: elon_qa                                           4230721344.py:43
                    INFO     Running global filter: minhash_dedup                                  4230721344.py:94
                                                                                                                                                                                                                                                                                                                                                                                                                                                                
[12/02/22 05:24:55] INFO     Final dataset size: 10560                                             1197017855.py:35
len(ds_1), len(fake_qa), len(pipeline.datasources[0]["dataset"]), len(pipeline.datasources[1]["dataset"])
(1601, 1601, 10560, 1589)
# Dry run with real per-example filters and cleaners: nothing is dropped,
# but each filter records its score in a new "<filter_name>_criteria"
# column, and minhash_dedup adds its bookkeeping columns.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],
        "filters": [check_char_repetition, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
    },
    # ...
]

pipeline = Pipeline(datasources)
pipeline.run(dry_run=True, global_filters=[minhash_dedup])

# Row count unchanged in dry-run mode...
assert len(ds) == len(pipeline.datasources[0]["dataset"])
# ...with one criteria column per filter, named after the filter function.
assert "check_char_repetition_criteria" in pipeline.datasources[0]["dataset"].features
assert "check_flagged_words_criteria" in pipeline.datasources[0]["dataset"].features
assert "duplicate" in pipeline.datasources[0]["dataset"].features
assert "meta_data" in pipeline.datasources[0]["dataset"].features
assert "__id__" in pipeline.datasources[0]["dataset"].features
[12/02/22 05:26:34] INFO     Original dataset size: 18014                                           4207204728.py:3
                    INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running filter: check_char_repetition on text                         4230721344.py:17
                    INFO     Running in dry-run mode                                               4230721344.py:19
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                    INFO     Running filter: check_flagged_words on text                           4230721344.py:17
                    INFO     Running in dry-run mode                                               4230721344.py:19
[12/02/22 05:26:35] INFO     Running cleaner: remove_empty_lines on text                           4230721344.py:69
[12/02/22 05:26:36] INFO     Running cleaner: normalize_whitespace on text                         4230721344.py:69
[12/02/22 05:26:37] INFO     Running global filter: minhash_dedup                                  4230721344.py:94
# Dry run using functools.partial to pre-configure a filter. A partial has
# no __name__ of its own, so one must be assigned for the pipeline to derive
# the "<filter_name>_criteria" column name.
from functools import partial

ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
check_char_repetition_p = partial(check_char_repetition, char_repetition_len=3)
check_char_repetition_p.__name__ = "check_char_repetition"  # required for column naming
datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],
        "filters": [check_char_repetition_p, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
    },
    # ...
]

pipeline = Pipeline(datasources)
pipeline.run(dry_run=True)

# Dry run keeps all rows and annotates criteria for both filters,
# including the partially-applied one (via its assigned __name__).
assert len(ds) == len(pipeline.datasources[0]["dataset"])
assert "check_char_repetition_criteria" in pipeline.datasources[0]["dataset"].features
assert "check_flagged_words_criteria" in pipeline.datasources[0]["dataset"].features
Downloading and preparing dataset wikitext/wikitext-103-v1 to /home/runner/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...
Dataset wikitext downloaded and prepared to /home/runner/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.
[06/16/23 22:16:16] INFO     Original dataset   <ipython-input-1-b7aa22f09144>:5
                             size: 18014                                        
                    INFO     Running           <ipython-input-1-afbdab7dc7e6>:43
                             datasource:                                        
                             wikitext                                           
                    INFO     Running filter:   <ipython-input-1-afbdab7dc7e6>:16
                             check_char_repeti                                  
                             tion on text                                       
                    INFO     Running in        <ipython-input-1-afbdab7dc7e6>:18
                             dry-run mode                                       
[06/16/23 22:16:18] INFO     Running filter:   <ipython-input-1-afbdab7dc7e6>:16
                             check_flagged_wor                                  
                             ds on text                                         
                    INFO     Running in        <ipython-input-1-afbdab7dc7e6>:18
                             dry-run mode                                       
[06/16/23 22:16:19] INFO     Running cleaner:  <ipython-input-1-afbdab7dc7e6>:71
                             remove_empty_line                                  
                             s on text                                          
                    INFO     Running cleaner:  <ipython-input-1-afbdab7dc7e6>:71
                             normalize_whitesp                                  
                             ace on text                                        
Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]Downloading builder script: 100%|##########| 8.48k/8.48k [00:00<00:00, 23.5MB/s]
Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]Downloading metadata: 100%|##########| 6.84k/6.84k [00:00<00:00, 21.9MB/s]
Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]Downloading readme: 100%|##########| 9.25k/9.25k [00:00<00:00, 26.6MB/s]
Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]Downloading data:   0%|          | 121k/190M [00:00<02:53, 1.10MB/s]Downloading data:   1%|          | 1.11M/190M [00:00<00:32, 5.74MB/s]Downloading data:   3%|3         | 6.35M/190M [00:00<00:07, 24.8MB/s]Downloading data:   6%|6         | 12.3M/190M [00:00<00:04, 37.9MB/s]Downloading data:   9%|9         | 17.8M/190M [00:00<00:03, 43.6MB/s]Downloading data:  12%|#2        | 23.2M/190M [00:00<00:03, 47.0MB/s]Downloading data:  15%|#4        | 28.5M/190M [00:00<00:03, 47.2MB/s]Downloading data:  18%|#8        | 34.7M/190M [00:00<00:03, 51.8MB/s]Downloading data:  21%|##1       | 40.6M/190M [00:00<00:02, 54.0MB/s]Downloading data:  24%|##4       | 46.0M/190M [00:01<00:02, 50.2MB/s]Downloading data:  28%|##7       | 52.3M/190M [00:01<00:02, 53.8MB/s]Downloading data:  31%|###       | 58.1M/190M [00:01<00:02, 54.8MB/s]Downloading data:  33%|###3      | 63.6M/190M [00:01<00:02, 55.0MB/s]Downloading data:  36%|###6      | 69.1M/190M [00:01<00:02, 50.8MB/s]Downloading data:  39%|###9      | 74.3M/190M [00:01<00:02, 50.5MB/s]Downloading data:  42%|####1     | 79.4M/190M [00:01<00:02, 46.5MB/s]Downloading data:  44%|####4     | 84.2M/190M [00:01<00:02, 45.2MB/s]Downloading data:  47%|####6     | 88.8M/190M [00:01<00:02, 44.1MB/s]Downloading data:  49%|####8     | 93.2M/190M [00:02<00:02, 43.3MB/s]Downloading data:  51%|#####1    | 97.6M/190M [00:02<00:02, 42.2MB/s]Downloading data:  54%|#####3    | 102M/190M [00:02<00:02, 41.6MB/s] Downloading data:  56%|#####5    | 106M/190M [00:02<00:02, 39.0MB/s]Downloading data:  58%|#####7    | 110M/190M [00:02<00:02, 36.5MB/s]Downloading data:  60%|#####9    | 114M/190M [00:02<00:02, 33.9MB/s]Downloading data:  62%|######1   | 117M/190M [00:02<00:02, 33.1MB/s]Downloading data:  63%|######3   | 120M/190M [00:02<00:02, 32.4MB/s]Downloading data:  65%|######5   | 124M/190M [00:02<00:02, 32.2MB/s]Downloading data:  67%|######6   | 127M/190M [00:03<00:01, 32.6MB/s]Downloading 
data:  69%|######8   | 130M/190M [00:03<00:01, 32.5MB/s]Downloading data:  70%|#######   | 134M/190M [00:03<00:01, 31.9MB/s]Downloading data:  72%|#######1  | 137M/190M [00:03<00:01, 31.7MB/s]Downloading data:  74%|#######3  | 140M/190M [00:03<00:01, 32.0MB/s]Downloading data:  76%|#######5  | 144M/190M [00:03<00:01, 32.9MB/s]Downloading data:  77%|#######7  | 147M/190M [00:03<00:01, 33.2MB/s]Downloading data:  79%|#######9  | 150M/190M [00:03<00:01, 32.8MB/s]Downloading data:  81%|########  | 154M/190M [00:03<00:01, 33.0MB/s]Downloading data:  83%|########2 | 157M/190M [00:04<00:00, 33.3MB/s]Downloading data:  85%|########4 | 161M/190M [00:04<00:00, 34.1MB/s]Downloading data:  86%|########6 | 164M/190M [00:04<00:00, 33.9MB/s]Downloading data:  88%|########8 | 168M/190M [00:04<00:00, 33.5MB/s]Downloading data:  90%|########9 | 171M/190M [00:04<00:00, 33.6MB/s]Downloading data:  92%|#########1| 175M/190M [00:04<00:00, 34.3MB/s]Downloading data:  94%|#########3| 178M/190M [00:04<00:00, 32.1MB/s]Downloading data:  95%|#########5| 181M/190M [00:04<00:00, 30.5MB/s]Downloading data:  97%|#########6| 184M/190M [00:04<00:00, 28.5MB/s]Downloading data:  98%|#########8| 187M/190M [00:04<00:00, 27.3MB/s]Downloading data: 100%|#########9| 190M/190M [00:05<00:00, 26.7MB/s]Downloading data: 100%|##########| 190M/190M [00:05<00:00, 37.2MB/s]
Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]                                                                      Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]Generating train split:   0%|          | 6996/1801350 [00:00<00:25, 69841.18 examples/s]Generating train split:   1%|          | 14000/1801350 [00:00<00:25, 69597.68 examples/s]Generating train split:   1%|1         | 21020/1801350 [00:00<00:25, 69867.70 examples/s]Generating train split:   2%|1         | 28050/1801350 [00:00<00:25, 70034.57 examples/s]Generating train split:   2%|2         | 38579/1801350 [00:00<00:25, 70104.47 examples/s]Generating train split:   3%|2         | 45605/1801350 [00:00<00:25, 70148.46 examples/s]Generating train split:   3%|2         | 52729/1801350 [00:00<00:24, 70475.42 examples/s]Generating train split:   4%|3         | 63247/1801350 [00:00<00:24, 70330.83 examples/s]Generating train split:   4%|3         | 70285/1801350 [00:01<00:24, 70341.83 examples/s]Generating train split:   4%|4         | 77350/1801350 [00:01<00:24, 70424.90 examples/s]Generating train split:   5%|4         | 87885/1801350 [00:01<00:24, 70351.46 examples/s]Generating train split:   5%|5         | 98384/1801350 [00:01<00:24, 70221.82 examples/s]Generating train split:   6%|5         | 105431/1801350 [00:01<00:24, 70282.23 examples/s]Generating train split:   6%|6         | 112464/1801350 [00:01<00:24, 70293.78 examples/s]Generating train split:   7%|6         | 122970/1801350 [00:01<00:23, 70197.46 examples/s]Generating train split:   7%|7         | 130000/1801350 [00:01<00:23, 70104.82 examples/s]Generating train split:   8%|7         | 137064/1801350 [00:01<00:23, 70246.30 examples/s]Generating train split:   8%|8         | 147562/1801350 [00:02<00:23, 70148.67 examples/s]Generating train split:   9%|8         | 158085/1801350 [00:02<00:23, 70114.15 examples/s]Generating train split:   9%|9         | 165122/1801350 [00:02<00:23, 70175.12 
examples/s]Generating train split:  10%|9         | 175697/1801350 [00:02<00:23, 70282.28 examples/s]Generating train split:  10%|#         | 186134/1801350 [00:02<00:23, 70047.75 examples/s]Generating train split:  11%|#         | 193186/1801350 [00:02<00:22, 70158.49 examples/s]Generating train split:  11%|#1        | 200208/1801350 [00:02<00:22, 70171.36 examples/s]Generating train split:  12%|#1        | 210683/1801350 [00:03<00:22, 70048.64 examples/s]Generating train split:  12%|#2        | 221121/1801350 [00:03<00:22, 69891.15 examples/s]Generating train split:  13%|#2        | 228127/1801350 [00:03<00:22, 69928.92 examples/s]Generating train split:  13%|#3        | 235174/1801350 [00:03<00:22, 70064.93 examples/s]Generating train split:  13%|#3        | 243163/1801350 [00:03<00:25, 61640.40 examples/s]Generating train split:  14%|#3        | 250235/1801350 [00:03<00:24, 63865.24 examples/s]Generating train split:  14%|#4        | 257232/1801350 [00:03<00:23, 65447.21 examples/s]Generating train split:  15%|#4        | 264232/1801350 [00:03<00:23, 66675.85 examples/s]Generating train split:  15%|#5        | 271231/1801350 [00:03<00:22, 67596.72 examples/s]Generating train split:  15%|#5        | 278266/1801350 [00:04<00:22, 68377.92 examples/s]Generating train split:  16%|#5        | 285253/1801350 [00:04<00:22, 68808.05 examples/s]Generating train split:  16%|#6        | 292224/1801350 [00:04<00:21, 69069.23 examples/s]Generating train split:  17%|#6        | 299275/1801350 [00:04<00:21, 69489.90 examples/s]Generating train split:  17%|#7        | 306297/1801350 [00:04<00:21, 69703.40 examples/s]Generating train split:  17%|#7        | 313344/1801350 [00:04<00:21, 69929.99 examples/s]Generating train split:  18%|#7        | 323832/1801350 [00:04<00:21, 69922.28 examples/s]Generating train split:  18%|#8        | 330845/1801350 [00:04<00:21, 69977.74 examples/s]Generating train split:  19%|#8        | 337871/1801350 [00:04<00:20, 70054.45 
examples/s]Generating train split:  19%|#9        | 348233/1801350 [00:05<00:20, 69685.37 examples/s]Generating train split:  20%|#9        | 358689/1801350 [00:05<00:20, 69690.23 examples/s]Generating train split:  20%|##        | 365747/1801350 [00:05<00:20, 69909.92 examples/s]Generating train split:  21%|##        | 372810/1801350 [00:05<00:20, 70094.91 examples/s]Generating train split:  21%|##1       | 379860/1801350 [00:05<00:20, 70201.53 examples/s]Generating train split:  21%|##1       | 386950/1801350 [00:05<00:20, 70396.05 examples/s]Generating train split:  22%|##1       | 394000/1801350 [00:05<00:20, 70214.28 examples/s]Generating train split:  22%|##2       | 401050/1801350 [00:05<00:19, 70294.70 examples/s]Generating train split:  23%|##2       | 411597/1801350 [00:05<00:19, 70295.83 examples/s]Generating train split:  23%|##3       | 422000/1801350 [00:06<00:19, 69858.97 examples/s]Generating train split:  24%|##3       | 429008/1801350 [00:06<00:19, 69912.37 examples/s]Generating train split:  24%|##4       | 436035/1801350 [00:06<00:19, 70001.73 examples/s]Generating train split:  25%|##4       | 443089/1801350 [00:06<00:19, 70146.41 examples/s]Generating train split:  25%|##5       | 453632/1801350 [00:06<00:19, 70194.83 examples/s]Generating train split:  26%|##5       | 460669/1801350 [00:06<00:19, 70237.83 examples/s]Generating train split:  26%|##6       | 471149/1801350 [00:06<00:18, 70100.75 examples/s]Generating train split:  27%|##6       | 478190/1801350 [00:06<00:18, 70178.13 examples/s]Generating train split:  27%|##6       | 485244/1801350 [00:06<00:18, 70273.22 examples/s]Generating train split:  27%|##7       | 492297/1801350 [00:07<00:18, 70340.81 examples/s]Generating train split:  28%|##7       | 499349/1801350 [00:07<00:18, 70389.43 examples/s]Generating train split:  28%|##8       | 509906/1801350 [00:07<00:18, 70383.07 examples/s]Generating train split:  29%|##8       | 520416/1801350 [00:07<00:18, 70267.97 
examples/s]Generating train split:  29%|##9       | 530980/1801350 [00:07<00:18, 70317.33 examples/s]Generating train split:  30%|###       | 541414/1801350 [00:07<00:17, 70070.76 examples/s]Generating train split:  30%|###       | 548456/1801350 [00:07<00:17, 70150.06 examples/s]Generating train split:  31%|###       | 555552/1801350 [00:07<00:17, 70190.12 examples/s]Generating train split:  31%|###1      | 562587/1801350 [00:08<00:17, 70228.29 examples/s]Generating train split:  32%|###1      | 573000/1801350 [00:08<00:17, 69922.40 examples/s]Generating train split:  32%|###2      | 580002/1801350 [00:08<00:17, 69940.44 examples/s]Generating train split:  33%|###2      | 587071/1801350 [00:08<00:17, 70138.57 examples/s]Generating train split:  33%|###3      | 597605/1801350 [00:08<00:17, 70169.40 examples/s]Generating train split:  34%|###3      | 608093/1801350 [00:08<00:17, 70081.48 examples/s]Generating train split:  34%|###4      | 615105/1801350 [00:08<00:16, 70088.25 examples/s]Generating train split:  35%|###4      | 622165/1801350 [00:08<00:16, 70219.56 examples/s]Generating train split:  35%|###5      | 632698/1801350 [00:09<00:16, 70211.60 examples/s]Generating train split:  36%|###5      | 639737/1801350 [00:09<00:16, 70255.25 examples/s]Generating train split:  36%|###6      | 650144/1801350 [00:09<00:16, 69943.92 examples/s]Generating train split:  36%|###6      | 657205/1801350 [00:09<00:16, 70110.77 examples/s]Generating train split:  37%|###6      | 664260/1801350 [00:09<00:16, 70226.00 examples/s]Generating train split:  37%|###7      | 671340/1801350 [00:09<00:16, 70382.61 examples/s]Generating train split:  38%|###7      | 678393/1801350 [00:09<00:15, 70421.16 examples/s]Generating train split:  38%|###8      | 688862/1801350 [00:09<00:15, 70181.12 examples/s]Generating train split:  39%|###8      | 699298/1801350 [00:10<00:15, 69965.08 examples/s]Generating train split:  39%|###9      | 709785/1801350 [00:10<00:15, 69944.32 
examples/s]Generating train split:  40%|###9      | 716820/1801350 [00:10<00:15, 70039.60 examples/s]Generating train split:  40%|####      | 723867/1801350 [00:10<00:15, 70149.12 examples/s]Generating train split:  41%|####      | 730932/1801350 [00:10<00:15, 70280.59 examples/s]Generating train split:  41%|####      | 738000/1801350 [00:10<00:15, 70174.76 examples/s]Generating train split:  41%|####1     | 745039/1801350 [00:10<00:15, 70233.87 examples/s]Generating train split:  42%|####1     | 752082/1801350 [00:10<00:14, 70288.03 examples/s]Generating train split:  42%|####2     | 762670/1801350 [00:10<00:14, 70398.43 examples/s]Generating train split:  43%|####2     | 773147/1801350 [00:11<00:14, 70200.58 examples/s]Generating train split:  43%|####3     | 780174/1801350 [00:11<00:14, 70216.12 examples/s]Generating train split:  44%|####3     | 790629/1801350 [00:11<00:14, 70031.16 examples/s]Generating train split:  44%|####4     | 801119/1801350 [00:11<00:14, 69994.71 examples/s]Generating train split:  45%|####4     | 808131/1801350 [00:11<00:14, 70022.79 examples/s]Generating train split:  45%|####5     | 815199/1801350 [00:11<00:14, 70189.61 examples/s]Generating train split:  46%|####5     | 822263/1801350 [00:11<00:13, 70308.83 examples/s]Generating train split:  46%|####6     | 832768/1801350 [00:11<00:13, 70203.92 examples/s]Generating train split:  47%|####6     | 839806/1801350 [00:12<00:13, 70246.60 examples/s]Generating train split:  47%|####7     | 850278/1801350 [00:12<00:13, 70086.40 examples/s]Generating train split:  48%|####7     | 860846/1801350 [00:12<00:13, 70207.48 examples/s]Generating train split:  48%|####8     | 867885/1801350 [00:12<00:13, 70249.60 examples/s]Generating train split:  49%|####8     | 874958/1801350 [00:12<00:13, 70370.23 examples/s]Generating train split:  49%|####8     | 882000/1801350 [00:12<00:13, 70148.95 examples/s]Generating train split:  50%|####9     | 892584/1801350 [00:12<00:12, 70295.87 
examples/s]Generating train split:  50%|#####     | 903053/1801350 [00:12<00:12, 70120.53 examples/s]Generating train split:  51%|#####     | 913592/1801350 [00:13<00:12, 70165.28 examples/s]Generating train split:  51%|#####1    | 920625/1801350 [00:13<00:12, 70202.81 examples/s]Generating train split:  51%|#####1    | 927649/1801350 [00:13<00:12, 70208.66 examples/s]Generating train split:  52%|#####1    | 934687/1801350 [00:13<00:12, 70252.16 examples/s]Generating train split:  52%|#####2    | 945118/1801350 [00:13<00:12, 69989.96 examples/s]Generating train split:  53%|#####3    | 954721/1801350 [00:13<00:13, 62433.26 examples/s]Generating train split:  53%|#####3    | 961790/1801350 [00:13<00:13, 64332.41 examples/s]Generating train split:  54%|#####3    | 968807/1801350 [00:13<00:12, 65773.45 examples/s]Generating train split:  54%|#####4    | 975796/1801350 [00:14<00:12, 66846.79 examples/s]Generating train split:  55%|#####4    | 982807/1801350 [00:14<00:12, 67732.37 examples/s]Generating train split:  55%|#####4    | 989793/1801350 [00:14<00:11, 68325.61 examples/s]Generating train split:  55%|#####5    | 996866/1801350 [00:14<00:11, 69009.51 examples/s]Generating train split:  56%|#####5    | 1003892/1801350 [00:14<00:11, 69367.07 examples/s]Generating train split:  56%|#####6    | 1010944/1801350 [00:14<00:11, 69700.47 examples/s]Generating train split:  57%|#####6    | 1017989/1801350 [00:14<00:11, 69919.39 examples/s]Generating train split:  57%|#####7    | 1028536/1801350 [00:14<00:11, 69873.50 examples/s]Generating train split:  57%|#####7    | 1035594/1801350 [00:14<00:10, 70029.83 examples/s]Generating train split:  58%|#####7    | 1042635/1801350 [00:14<00:10, 70132.84 examples/s]Generating train split:  58%|#####8    | 1053076/1801350 [00:15<00:10, 69932.61 examples/s]Generating train split:  59%|#####9    | 1063649/1801350 [00:15<00:10, 70122.18 examples/s]Generating train split:  59%|#####9    | 1070689/1801350 [00:15<00:10, 70188.48 
examples/s]Generating train split:  60%|######    | 1081157/1801350 [00:15<00:10, 70046.94 examples/s]Generating train split:  60%|######    | 1088195/1801350 [00:15<00:10, 70128.88 examples/s]Generating train split:  61%|######    | 1098741/1801350 [00:15<00:10, 70188.61 examples/s]Generating train split:  62%|######1   | 1109227/1801350 [00:15<00:09, 70091.20 examples/s]Generating train split:  62%|######2   | 1119819/1801350 [00:16<00:09, 70257.69 examples/s]Generating train split:  63%|######2   | 1126902/1801350 [00:16<00:09, 70390.16 examples/s]Generating train split:  63%|######2   | 1133951/1801350 [00:16<00:09, 70413.75 examples/s]Generating train split:  64%|######3   | 1144453/1801350 [00:16<00:09, 70271.93 examples/s]Generating train split:  64%|######4   | 1155000/1801350 [00:16<00:09, 70100.43 examples/s]Generating train split:  65%|######4   | 1165574/1801350 [00:16<00:09, 70224.90 examples/s]Generating train split:  65%|######5   | 1172625/1801350 [00:16<00:08, 70290.50 examples/s]Generating train split:  65%|######5   | 1179666/1801350 [00:16<00:08, 70317.65 examples/s]Generating train split:  66%|######6   | 1190090/1801350 [00:17<00:08, 70028.40 examples/s]Generating train split:  67%|######6   | 1200687/1801350 [00:17<00:08, 70230.21 examples/s]Generating train split:  67%|######7   | 1207743/1801350 [00:17<00:08, 70305.06 examples/s]Generating train split:  67%|######7   | 1214825/1801350 [00:17<00:08, 70434.32 examples/s]Generating train split:  68%|######7   | 1221942/1801350 [00:17<00:08, 70627.81 examples/s]Generating train split:  68%|######8   | 1232601/1801350 [00:17<00:08, 70618.68 examples/s]Generating train split:  69%|######9   | 1243082/1801350 [00:17<00:07, 70360.41 examples/s]Generating train split:  69%|######9   | 1250140/1801350 [00:17<00:07, 70412.01 examples/s]Generating train split:  70%|######9   | 1260727/1801350 [00:18<00:07, 70465.02 examples/s]Generating train split:  70%|#######   | 1267785/1801350 [00:18<00:07, 
70490.60 examples/s]Generating train split:  71%|#######   | 1278264/1801350 [00:18<00:07, 70269.08 examples/s]Generating train split:  71%|#######1  | 1285369/1801350 [00:18<00:07, 70461.18 examples/s]Generating train split:  72%|#######1  | 1295989/1801350 [00:18<00:07, 70575.04 examples/s]Generating train split:  73%|#######2  | 1306556/1801350 [00:18<00:07, 70412.01 examples/s]Generating train split:  73%|#######3  | 1317000/1801350 [00:18<00:06, 70042.67 examples/s]Generating train split:  74%|#######3  | 1327515/1801350 [00:19<00:06, 69927.34 examples/s]Generating train split:  74%|#######4  | 1334553/1801350 [00:19<00:06, 69990.67 examples/s]Generating train split:  74%|#######4  | 1341608/1801350 [00:19<00:06, 70127.45 examples/s]Generating train split:  75%|#######4  | 1348643/1801350 [00:19<00:06, 70182.39 examples/s]Generating train split:  75%|#######5  | 1359074/1801350 [00:19<00:06, 69948.64 examples/s]Generating train split:  76%|#######6  | 1369599/1801350 [00:19<00:06, 70020.29 examples/s]Generating train split:  77%|#######6  | 1380000/1801350 [00:19<00:06, 69658.17 examples/s]Generating train split:  77%|#######6  | 1387000/1801350 [00:19<00:05, 69656.20 examples/s]Generating train split:  77%|#######7  | 1394000/1801350 [00:19<00:05, 69675.53 examples/s]Generating train split:  78%|#######7  | 1401000/1801350 [00:20<00:05, 69709.01 examples/s]Generating train split:  78%|#######8  | 1408000/1801350 [00:20<00:05, 69760.47 examples/s]Generating train split:  79%|#######8  | 1415000/1801350 [00:20<00:05, 69782.21 examples/s]Generating train split:  79%|#######8  | 1422000/1801350 [00:20<00:05, 69840.70 examples/s]Generating train split:  79%|#######9  | 1429059/1801350 [00:20<00:05, 70056.51 examples/s]Generating train split:  80%|#######9  | 1436161/1801350 [00:20<00:05, 70337.25 examples/s]Generating train split:  80%|########  | 1443206/1801350 [00:20<00:05, 70366.05 examples/s]Generating train split:  81%|########  | 1453720/1801350 
[00:20<00:04, 70258.49 examples/s]Generating train split:  81%|########1 | 1464175/1801350 [00:20<00:04, 70054.95 examples/s]Generating train split:  82%|########1 | 1474674/1801350 [00:21<00:04, 70031.22 examples/s]Generating train split:  82%|########2 | 1485094/1801350 [00:21<00:04, 69846.62 examples/s]Generating train split:  83%|########2 | 1492134/1801350 [00:21<00:04, 69974.96 examples/s]Generating train split:  83%|########3 | 1499181/1801350 [00:21<00:04, 70098.41 examples/s]Generating train split:  84%|########3 | 1509770/1801350 [00:21<00:04, 70264.63 examples/s]Generating train split:  84%|########4 | 1520199/1801350 [00:21<00:04, 70016.76 examples/s]Generating train split:  85%|########4 | 1527211/1801350 [00:21<00:03, 70038.22 examples/s]Generating train split:  85%|########5 | 1537721/1801350 [00:22<00:03, 70045.74 examples/s]Generating train split:  86%|########5 | 1544747/1801350 [00:22<00:03, 70094.41 examples/s]Generating train split:  86%|########6 | 1551835/1801350 [00:22<00:03, 70294.87 examples/s]Generating train split:  87%|########6 | 1562246/1801350 [00:22<00:03, 69974.32 examples/s]Generating train split:  87%|########7 | 1569334/1801350 [00:22<00:03, 70202.64 examples/s]Generating train split:  88%|########7 | 1579911/1801350 [00:22<00:03, 70310.10 examples/s]Generating train split:  88%|########8 | 1590324/1801350 [00:22<00:03, 70010.28 examples/s]Generating train split:  89%|########8 | 1600876/1801350 [00:22<00:02, 70115.92 examples/s]Generating train split:  89%|########9 | 1611271/1801350 [00:23<00:02, 69853.32 examples/s]Generating train split:  90%|########9 | 1618278/1801350 [00:23<00:02, 69899.53 examples/s]Generating train split:  90%|######### | 1625321/1801350 [00:23<00:02, 70028.87 examples/s]Generating train split:  91%|######### | 1635897/1801350 [00:23<00:02, 70188.94 examples/s]Generating train split:  91%|#########1| 1642938/1801350 [00:23<00:02, 70242.61 examples/s]Generating train split:  92%|#########1| 
1653417/1801350 [00:23<00:02, 70107.15 examples/s]Generating train split:  92%|#########2| 1662727/1801350 [00:23<00:02, 62465.60 examples/s]Generating train split:  93%|#########2| 1669782/1801350 [00:23<00:02, 64293.41 examples/s]Generating train split:  93%|#########3| 1676732/1801350 [00:24<00:01, 65564.05 examples/s]Generating train split:  93%|#########3| 1683705/1801350 [00:24<00:01, 66640.80 examples/s]Generating train split:  94%|#########3| 1690666/1801350 [00:24<00:01, 67441.49 examples/s]Generating train split:  94%|#########4| 1697637/1801350 [00:24<00:01, 68068.33 examples/s]Generating train split:  95%|#########4| 1704576/1801350 [00:24<00:01, 68443.35 examples/s]Generating train split:  95%|#########5| 1711608/1801350 [00:24<00:01, 68984.17 examples/s]Generating train split:  95%|#########5| 1718688/1801350 [00:24<00:01, 69512.23 examples/s]Generating train split:  96%|#########5| 1729094/1801350 [00:24<00:01, 69454.83 examples/s]Generating train split:  96%|#########6| 1736172/1801350 [00:24<00:00, 69808.45 examples/s]Generating train split:  97%|#########6| 1746673/1801350 [00:25<00:00, 69878.36 examples/s]Generating train split:  98%|#########7| 1757121/1801350 [00:25<00:00, 69797.72 examples/s]Generating train split:  98%|#########7| 1764118/1801350 [00:25<00:00, 69837.67 examples/s]Generating train split:  99%|#########8| 1774687/1801350 [00:25<00:00, 70048.59 examples/s]Generating train split:  99%|#########9| 1785156/1801350 [00:25<00:00, 69961.84 examples/s]Generating train split:  99%|#########9| 1792190/1801350 [00:25<00:00, 70050.59 examples/s]Generating train split: 100%|##########| 1801350/1801350 [00:25<00:00, 70040.93 examples/s]                                                                                           Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]                                                                            
# End-to-end run: per-example filters/cleaners, then a global minhash dedup.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")

wikitext_source = {
    "dataset": ds,
    "name": "wikitext",
    "columns": ["text"],
    "filters": [check_char_repetition, check_flagged_words],
    "cleaners": [remove_empty_lines, normalize_whitespace],
}
datasources = [wikitext_source]  # ...

global_filters = [minhash_dedup]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=global_filters)

processed = pipeline.datasources[0]["dataset"]
logger.info(f"Final dataset size: {len(processed)}")

# Filtering + dedup must have dropped at least one example.
assert len(ds) > len(processed)
[12/02/22 05:27:36] INFO     Original dataset size: 18014                                           2624608473.py:2
                    INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running filter: check_char_repetition on text                         4230721344.py:17
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
[12/02/22 05:27:37] INFO     Running filter: check_flagged_words on text                           4230721344.py:17
[12/02/22 05:27:38] INFO     Running cleaner: remove_empty_lines on text                           4230721344.py:69
[12/02/22 05:27:39] INFO     Running cleaner: normalize_whitespace on text                         4230721344.py:69
[12/02/22 05:27:40] INFO     Running global filter: minhash_dedup                                  4230721344.py:94
[12/02/22 05:28:32] INFO     Final dataset size: 10557                                             2624608473.py:17
# Test running two datasources through the pipeline with a global dedup
# pass. NOTE: both sources set "skip_global": False, so minhash_dedup is
# applied to each of them — the skip path itself is exercised separately
# in the following cell.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
ds_1 = load_dataset("lcama/elon-tweets", split="train")

datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],
        "filters": [check_char_repetition, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
        "skip_global": False,  # participate in the global dedup
    },
    {
        "dataset": ds_1,
        "name": "elon",
        "columns": ["text"],
        "filters": [check_char_repetition, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
        "skip_global": False,  # participate in the global dedup
    },
    # ...
]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=[minhash_dedup])
logger.info(f"Final dataset size: {len(pipeline.datasources[0]['dataset'])}")

# Filters, cleaners, and dedup should shrink both datasources.
assert len(ds) > len(pipeline.datasources[0]["dataset"])
assert len(ds_1) > len(pipeline.datasources[1]["dataset"])
[12/02/22 05:31:30] INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running filter: check_char_repetition on text                         4230721344.py:17
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                    INFO     Running filter: check_flagged_words on text                           4230721344.py:17
                    INFO     Running cleaner: remove_empty_lines on text                           4230721344.py:69
[12/02/22 05:31:31] INFO     Running cleaner: normalize_whitespace on text                         4230721344.py:69
                    INFO     Running datasource: elon                                              4230721344.py:43
                    INFO     Running filter: check_char_repetition on text                         4230721344.py:17
[12/02/22 05:31:32] INFO     Running filter: check_flagged_words on text                           4230721344.py:17
                    INFO     Running cleaner: remove_empty_lines on text                           4230721344.py:69
[12/02/22 05:31:33] INFO     Running cleaner: normalize_whitespace on text                         4230721344.py:69
[12/02/22 05:31:34] INFO     Running global filter: minhash_dedup                                  4230721344.py:94
[12/02/22 05:32:32] INFO     Final dataset size: 10557                                             1079502618.py:26
# Test the ability to skip global filters: both datasources start from the
# same dataset, but only the first one is subject to the global dedup pass.
def _make_source(name, skip_global):
    # Build a datasource spec; only the name and skip_global flag vary.
    return {
        "dataset": ds,
        "name": name,
        "columns": ["text"],
        "filters": [check_char_repetition, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
        "skip_global": skip_global,
    }

datasources = [_make_source("wikitext", False), _make_source("wikitext1", True)]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=global_filters)
logger.info(f"Final dataset size: {len(pipeline.datasources[0]['dataset'])}")

# The deduped source must end up strictly smaller than the skipped one.
deduped = pipeline.datasources[0]["dataset"]
skipped = pipeline.datasources[1]["dataset"]
assert len(deduped) < len(skipped)
[12/02/22 05:33:45] INFO     Running datasource: wikitext                                          4230721344.py:43
                    INFO     Running filter: check_char_repetition on text                         4230721344.py:17
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                    INFO     Running filter: check_flagged_words on text                           4230721344.py:17
                    INFO     Running cleaner: remove_empty_lines on text                           4230721344.py:69
                    INFO     Running cleaner: normalize_whitespace on text                         4230721344.py:69
                    INFO     Running datasource: wikitext1                                         4230721344.py:43
                    INFO     Running filter: check_char_repetition on text                         4230721344.py:17
                    INFO     Running filter: check_flagged_words on text                           4230721344.py:17
                    INFO     Running cleaner: remove_empty_lines on text                           4230721344.py:69
                    INFO     Running cleaner: normalize_whitespace on text                         4230721344.py:69
                    INFO     Running global filter: minhash_dedup                                  4230721344.py:94
[12/02/22 05:34:29] INFO     Final dataset size: 10557                                              173363204.py:23