filter

This module contains all the various filtering options supported.

source

check_compression_ratio

 check_compression_ratio (document, compression_threshold:float=0.5,
                          compression_level:int=3, dry_run=False)

Checks if the document is below the compression ratio threshold.

Type Default Details
document document to be analyzed
compression_threshold float 0.5 threshold for compression ratio
compression_level int 3 compression level to use
dry_run bool False if True, returns the compression ratio of the document
Returns bool returns True if document is below threshold
test_str0 = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
test_str1 = "This is a test string."
assert check_compression_ratio(test_str0, dry_run=True) < check_compression_ratio(test_str1, dry_run=True)

source

check_char_repetition

 check_char_repetition (document, char_repetition_len=10,
                        char_repetition_threshold=0.2, dry_run=False)

Checks if the document is below the character repetition threshold.

Type Default Details
document document to be analyzed
char_repetition_len int 10 length of character repetition
char_repetition_threshold float 0.2 threshold for character repetition
dry_run bool False if True, returns the ratio of character repetition
Returns bool returns True if document is below threshold
test_str = "aaabbbcccddd"
assert check_char_repetition(test_str, char_repetition_len=3, char_repetition_threshold=0.2) == True

test_str = "aaaaaaabbbcccddd"
assert check_char_repetition(test_str, char_repetition_len=3, char_repetition_threshold=0.2) == False

source

check_flagged_words

 check_flagged_words (document:str, flagged_words:list=['anal', 'bbw',
                      'bdsm', 'blowjob', 'blowjobs', 'brazzers',
                      'bukkake', 'camgirl', 'camwhore', 'cocksucking',
                      'creampie', 'cuckold', 'cum', 'cumming', 'cums',
                      'cumshot', 'cumshots', 'cumslut', 'cunnilingus',
                      'deepthroat', 'deepthroating', 'dildo', 'dildos',
                      'dogging', 'doggystyle', 'erotic', 'fellatio',
                      'femdom', 'fingering', 'fisting', 'footjob',
                      'gangbang', 'handjob', 'hentai', 'horney',
                      'horniest', 'horny', 'jism', 'jizz', 'lolli',
                      'lolling', 'masterbating', 'masturbate',
                      'masturbating', 'masturbation', 'milf', 'orgies',
                      'orgy', 'pegging', 'porn', 'pornhub', 'porno',
                      'pornos', 'pornstar', 'pornstars', 'redtube',
                      'rimming', 'slutty', 'strapon', 'threesome',
                      'vibrator', 'xhamster', 'xnxx', 'xvideos', 'xxx',
                      'youporn'], flagged_words_threshold:float=0.1,
                      get_words_func:callable=<function get_words>,
                      dry_run:bool=False)

Checks if a document contains a high percentage of flagged words.

Type Default Details
document str document to be analyzed
flagged_words list [‘anal’, ‘bbw’, ‘bdsm’, ‘blowjob’, ‘blowjobs’, ‘brazzers’, ‘bukkake’, ‘camgirl’, ‘camwhore’, ‘cocksucking’, ‘creampie’, ‘cuckold’, ‘cum’, ‘cumming’, ‘cums’, ‘cumshot’, ‘cumshots’, ‘cumslut’, ‘cunnilingus’, ‘deepthroat’, ‘deepthroating’, ‘dildo’, ‘dildos’, ‘dogging’, ‘doggystyle’, ‘erotic’, ‘fellatio’, ‘femdom’, ‘fingering’, ‘fisting’, ‘footjob’, ‘gangbang’, ‘handjob’, ‘hentai’, ‘horney’, ‘horniest’, ‘horny’, ‘jism’, ‘jizz’, ‘lolli’, ‘lolling’, ‘masterbating’, ‘masturbate’, ‘masturbating’, ‘masturbation’, ‘milf’, ‘orgies’, ‘orgy’, ‘pegging’, ‘porn’, ‘pornhub’, ‘porno’, ‘pornos’, ‘pornstar’, ‘pornstars’, ‘redtube’, ‘rimming’, ‘slutty’, ‘strapon’, ‘threesome’, ‘vibrator’, ‘xhamster’, ‘xnxx’, ‘xvideos’, ‘xxx’, ‘youporn’] list of flagged words
flagged_words_threshold float 0.1 threshold for flagged words
get_words_func callable get_words function to get words from document
dry_run bool False if True, returns the ratio of flagged words
Returns bool returns True if document is below threshold unless dry_run is True

The check_flagged_words filter function is purposefully hidden in this documentation as it would show the flagged words directly in the documentation, which might shock some people.

assert check_flagged_words("test") == True
assert check_flagged_words("bdsm") == False

source

check_perplexity

 check_perplexity (document, perplexity_threshold=10000, model=None,
                   dry_run=False)

Checks if the document is below the perplexity threshold.

Type Default Details
document document to be analyzed
perplexity_threshold int 10000 threshold for perplexity
model NoneType None model to calculate perplexity
dry_run bool False if True, returns the perplexity of the document
Returns bool returns True if document is below threshold

To run this test, you need to have kenlm and sentencepiece installed: pip install https://github.com/kpu/kenlm/archive/master.zip sentencepiece

from squeakily.helpers import KenlmModel

model = KenlmModel.from_pretrained(
    model_dataset="wikipedia",
    language="en",
    lower_case=True,
    remove_accents=True,
    normalize_numbers=True,
    punctuation=1,
)

low_test_str = "I am very perplexed"
high_test_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed ..."

assert check_perplexity(low_test_str, perplexity_threshold=1_000, model=model) == True
assert check_perplexity(high_test_str, perplexity_threshold=1_000, model=model) == False
/home/nathan/miniconda3/envs/squeakily/lib/python3.10/site-packages/huggingface_hub/file_download.py:592: FutureWarning: `cached_download` is the legacy way to download files from the HF hub, please consider upgrading to `hf_hub_download`
  warnings.warn(

source

check_language

 check_language (document, language='en', language_threshold=0.9,
                 model=None, dry_run=False)

Checks if the document is in the given language with at least the threshold confidence.

Type Default Details
document document to be analyzed
language str en language to check
language_threshold float 0.9 threshold for language
model NoneType None model to check language
dry_run bool False if True, returns the language confidence score of the document (-1. if the detected language does not match)
Returns bool returns True if the document's language confidence is above the threshold
from squeakily.helpers import FastTextLanguageDetector

fasttext_model = FastTextLanguageDetector.from_pretrained()

english_text = "Hi, my name is John."
spanish_text = "Hola, me llamo Juan."
chinese_text = "你好,我叫张三。"

assert check_language(english_text, language="en", language_threshold=0.85, model=fasttext_model) == True
assert check_language(spanish_text, language="en", language_threshold=0.85, model=fasttext_model) == False
assert check_language(chinese_text, language="en", language_threshold=0.85, model=fasttext_model) == False

# test dry run
assert check_language(english_text, language="en", language_threshold=0.85, model=fasttext_model, dry_run=True) > 0.
assert check_language(spanish_text, language="en", language_threshold=0.85, model=fasttext_model, dry_run=True) == -1.
assert check_language(chinese_text, language="es", language_threshold=0.85, model=fasttext_model, dry_run=True) == -1.
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.

source

check_word_number

 check_word_number (document, min_word_threshold=5,
                    max_word_threshold=100, get_words_func=<function
                    get_words>, dry_run=False)

Checks if the document is between the minimum and maximum word thresholds.

Type Default Details
document document to be analyzed
min_word_threshold int 5 minimum number of words
max_word_threshold int 100 maximum number of words
get_words_func function get_words function to get words from document
dry_run bool False if True, returns the number of words in the document
Returns bool returns True if document is between the minimum and maximum thresholds
test_str = "This is a test string."

assert check_word_number(test_str, min_word_threshold=5, max_word_threshold=10) == True
assert check_word_number(test_str, min_word_threshold=1, max_word_threshold=4) == False

source

check_stop_word_ratio

 check_stop_word_ratio (document, stop_word_threshold=0.3,
                        stop_words=['a', 'a.k.a', 'aboard', 'about',
                        'above', 'abt', 'accord', 'according', 'across',
                        'after', 'against', 'ago', 'aground', 'ahead',
                        'aka', 'ala', 'albeit', 'all', 'along',
                        'alongside', 'although', 'am', 'amid', 'amidst',
                        'among', 'amongst', 'amoung', 'an', 'and',
                        'and/or', 'another', 'any', 'any1', 'anybody',
                        'anyone', 'anything', 'are', 'around', 'as',
                        'aside', 'astride', 'at', 'atop', 'away', 'b',
                        'b/c', 'b/t', 'back', 'base', 'based', 'bc', 'be',
                        'because', 'been', 'before', 'behind', 'being',
                        'below', 'beneath', 'beside', 'besides',
                        'between', 'beyond', 'board', 'both', 'btwn',
                        'but', 'by', 'can', 'cause', 'circa', 'cos',
                        'could', 'coz', 'cus', 'depend', 'depending',
                        'despite', 'did', 'do', 'does', 'down', 'due',
                        'during', 'each', 'either', 'else', 'even',
                        'ever', 'every', 'everybody', 'everyone',
                        'everything', 'except', 'for', 'forth', 'from',
                        'get', 'gets', 'getting', 'give', 'given', 'got',
                        'had', 'half', 'has', 'hav', 'have', 'having',
                        'he', 'her', 'hers', 'herself', 'him', 'himself',
                        'his', 'how', 'however', 'i', "i'd", 'if', 'in',
                        'include', 'including', 'inside', 'instead',
                        'into', 'is', 'it', "it's", 'its', 'itself',
                        'lest', 'like', 'made', 'many', 'may', 'me',
                        'might', 'mine', 'minus', 'most', 'much', 'must',
                        'my', 'myself', 'nary', 'near', 'nearby',
                        'neither', 'next', 'nigh', 'no', 'nobody', 'none',
                        'noone', 'nor', 'not', 'nothing',
                        'notwithstanding', 'of', 'off', 'on', 'onboard',
                        'once', 'one', 'ones', 'oneself', 'only', 'onto',
                        'opposite', 'or', 'other', 'others', 'ought',
                        'our', 'ours', 'ourselves', 'out', 'outside',
                        'over', 'overt', 'own', 'past', 'per', 'plus',
                        'prior', 'quite', 'rather', 're', 'regard',
                        'regarding', 'regardless', 'round', 's/he',
                        'save', 'self', 'shall', 'she', 'should', 'side',
                        'since', 'so', 'some', 'somebody', 'someone',
                        'something', 'such', 'sure', 'teh', 'than',
                        'thanks', 'that', 'the', 'their', 'theirs',
                        'them', 'themselves', 'then', 'there', 'these',
                        'they', "they're", 'thier', 'this', 'tho',
                        'those', 'thou', 'though', 'through',
                        'throughout', 'thru', 'thy', 'til', 'till', 'to',
                        'together', 'too', 'toward', 'towards', 'u',
                        'under', 'underneath', 'unless', 'unlike',
                        'until', 'unto', 'up', 'upon', 'ur', 'us', 'use',
                        'versus', 'via', 'vs', 'vs.', 'w/', 'w/o',
                        'w/out', 'was', 'we', 'were', 'what', 'whatever',
                        'whatnot', 'when', 'whenever', 'where', 'whereas',
                        'wherever', 'whether', 'which', 'while', 'whilst',
                        'whither', 'who', "who's", 'whoever', 'whom',
                        'whomever', 'whose', 'why', 'will', 'with',
                        'within', 'without', 'wo', 'worth', 'would',
                        'wud', "y'all", 'ya', 'yet', 'yo', 'you',
                        "you're", 'your', 'youre', 'yours', 'yourself',
                        'yourselves'], get_words_func=<function
                        get_words>, dry_run=False)

Checks if the document contains a high percentage of stop words.

Type Default Details
document document to be analyzed
stop_word_threshold float 0.3 threshold for stop words
stop_words list [‘a’, ‘a.k.a’, ‘aboard’, ‘about’, ‘above’, ‘abt’, ‘accord’, ‘according’, ‘across’, ‘after’, ‘against’, ‘ago’, ‘aground’, ‘ahead’, ‘aka’, ‘ala’, ‘albeit’, ‘all’, ‘along’, ‘alongside’, ‘although’, ‘am’, ‘amid’, ‘amidst’, ‘among’, ‘amongst’, ‘amoung’, ‘an’, ‘and’, ‘and/or’, ‘another’, ‘any’, ‘any1’, ‘anybody’, ‘anyone’, ‘anything’, ‘are’, ‘around’, ‘as’, ‘aside’, ‘astride’, ‘at’, ‘atop’, ‘away’, ‘b’, ‘b/c’, ‘b/t’, ‘back’, ‘base’, ‘based’, ‘bc’, ‘be’, ‘because’, ‘been’, ‘before’, ‘behind’, ‘being’, ‘below’, ‘beneath’, ‘beside’, ‘besides’, ‘between’, ‘beyond’, ‘board’, ‘both’, ‘btwn’, ‘but’, ‘by’, ‘can’, ‘cause’, ‘circa’, ‘cos’, ‘could’, ‘coz’, ‘cus’, ‘depend’, ‘depending’, ‘despite’, ‘did’, ‘do’, ‘does’, ‘down’, ‘due’, ‘during’, ‘each’, ‘either’, ‘else’, ‘even’, ‘ever’, ‘every’, ‘everybody’, ‘everyone’, ‘everything’, ‘except’, ‘for’, ‘forth’, ‘from’, ‘get’, ‘gets’, ‘getting’, ‘give’, ‘given’, ‘got’, ‘had’, ‘half’, ‘has’, ‘hav’, ‘have’, ‘having’, ‘he’, ‘her’, ‘hers’, ‘herself’, ‘him’, ‘himself’, ‘his’, ‘how’, ‘however’, ‘i’, “i’d”, ‘if’, ‘in’, ‘include’, ‘including’, ‘inside’, ‘instead’, ‘into’, ‘is’, ‘it’, “it’s”, ‘its’, ‘itself’, ‘lest’, ‘like’, ‘made’, ‘many’, ‘may’, ‘me’, ‘might’, ‘mine’, ‘minus’, ‘most’, ‘much’, ‘must’, ‘my’, ‘myself’, ‘nary’, ‘near’, ‘nearby’, ‘neither’, ‘next’, ‘nigh’, ‘no’, ‘nobody’, ‘none’, ‘noone’, ‘nor’, ‘not’, ‘nothing’, ‘notwithstanding’, ‘of’, ‘off’, ‘on’, ‘onboard’, ‘once’, ‘one’, ‘ones’, ‘oneself’, ‘only’, ‘onto’, ‘opposite’, ‘or’, ‘other’, ‘others’, ‘ought’, ‘our’, ‘ours’, ‘ourselves’, ‘out’, ‘outside’, ‘over’, ‘overt’, ‘own’, ‘past’, ‘per’, ‘plus’, ‘prior’, ‘quite’, ‘rather’, ‘re’, ‘regard’, ‘regarding’, ‘regardless’, ‘round’, ‘s/he’, ‘save’, ‘self’, ‘shall’, ‘she’, ‘should’, ‘side’, ‘since’, ‘so’, ‘some’, ‘somebody’, ‘someone’, ‘something’, ‘such’, ‘sure’, ‘teh’, ‘than’, ‘thanks’, ‘that’, ‘the’, ‘their’, ‘theirs’, ‘them’, ‘themselves’, ‘then’, ‘there’, ‘these’, ‘they’, “they’re”, ‘thier’, ‘this’, ‘tho’, ‘those’, 
‘thou’, ‘though’, ‘through’, ‘throughout’, ‘thru’, ‘thy’, ‘til’, ‘till’, ‘to’, ‘together’, ‘too’, ‘toward’, ‘towards’, ‘u’, ‘under’, ‘underneath’, ‘unless’, ‘unlike’, ‘until’, ‘unto’, ‘up’, ‘upon’, ‘ur’, ‘us’, ‘use’, ‘versus’, ‘via’, ‘vs’, ‘vs.’, ‘w/’, ‘w/o’, ‘w/out’, ‘was’, ‘we’, ‘were’, ‘what’, ‘whatever’, ‘whatnot’, ‘when’, ‘whenever’, ‘where’, ‘whereas’, ‘wherever’, ‘whether’, ‘which’, ‘while’, ‘whilst’, ‘whither’, ‘who’, “who’s”, ‘whoever’, ‘whom’, ‘whomever’, ‘whose’, ‘why’, ‘will’, ‘with’, ‘within’, ‘without’, ‘wo’, ‘worth’, ‘would’, ‘wud’, “y’all”, ‘ya’, ‘yet’, ‘yo’, ‘you’, “you’re”, ‘your’, ‘youre’, ‘yours’, ‘yourself’, ‘yourselves’] list of stop words
get_words_func function get_words function to get words from document
dry_run bool False if True, returns the ratio of stop words in the document
Returns bool returns True if document is below the threshold
assert check_stop_word_ratio("test") == True
assert check_stop_word_ratio("the") == False
assert check_stop_word_ratio("the funny llama", stop_word_threshold=0.3) == False
assert check_stop_word_ratio("the funny llama", stop_word_threshold=0.5) == True
# Test french stop words
assert check_stop_word_ratio("le", stop_words=stopwords['fr']) == False
assert check_stop_word_ratio("le chien est beau", stop_words=stopwords['fr'], dry_run=True) == 0.5
assert check_stop_word_ratio("le chien est beau", stop_words=stopwords['fr'], stop_word_threshold=0.3) == False

source

check_code_parsability

 check_code_parsability (document, program_language='python')

Checks if the document contains parsable code.

Type Default Details
document document to be analyzed
program_language str python programming language to check
Returns bool returns True if the code is parsable
# Test python code
assert check_code_parsability("print('hello world')", program_language="python") == True
assert check_code_parsability("print('hello world'", program_language="python") == False
# Test javascript code
assert check_code_parsability("console.log('hello world')", program_language="javascript") == True
assert check_code_parsability("console.log('hello world'", program_language="javascript") == False
WARNING:root:Autoloading AST parser for javascript: Start download from Github.
WARNING:root:Start cloning the parser definition from Github.
WARNING:root:Compiling language for javascript

Whole Dataset Filtering

MinHash Deduplication

The following code has all been adapted from the awesome Chenghao Mou and their work on the BigCode repository!

result = _hash_func(0, "Hello world!", num_perm=128)
assert result["__id__"] == 0
assert result["__signature__"].shape == (128,)
assert result["__signature__"].dtype == np.uint64
data = ["Hello world!", "Hello world"]
signatures = [_hash_func(i, content, num_perm=128) for i, content in enumerate(data)]
index = MinHashLSH(threshold=0.5, num_perm=128)
for signature in signatures:
    index.insert(
        signature["__id__"],
        MinHash(num_perm=128, hashvalues=signature["__signature__"], seed=MINHASH_SEED)
    )
assert _query_content(0, signatures[0]["__signature__"], index=index) == {'__neighbors__': [1], '__id__': 0}
assert _query_content(1, signatures[1]["__signature__"], index=index) == {'__neighbors__': [0], '__id__': 1}
assert _jaccard_similarity("a = 1", "a = 2") == 0.3333333333333333
assert _jaccard_similarity("a = 1", "a = 1") == 1.0

source

minhash_dedup

 minhash_dedup (ds, column, community_detection:bool=False,
                report_false_positive_rate:bool=False,
                threshold:float=0.85, num_perm:int=128,
                dry_run:bool=False)

Deduplicate the dataset using minhashing as described in the paper “Deduplicating Training Data Makes Language Models Better”.

Type Default Details
ds The dataset to deduplicate.
column The column to use for deduplication.
community_detection bool False Whether to use community detection to find the duplicate communities, or to use the connected components.
report_false_positive_rate bool False Whether to report the false positive rate.
threshold float 0.85 The threshold to use for deduplication.
num_perm int 128 The number of permutations to use for minhashing.
dry_run bool False Whether to run the deduplication in dry run mode.
Returns Dataset
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
dataset = dataset.select(range(1_000))
deduped_dataset = minhash_dedup(dataset, "text", community_detection=True, threshold=0.85, num_perm=128)

assert len(deduped_dataset) == len(dataset) - len(dup_ids)
assert deduped_dataset.column_names == dataset.column_names + ["__id__"]
                                                                                                                                                                
# test dry run
deduped_dataset = minhash_dedup(dataset, "text", community_detection=True, threshold=0.85, num_perm=128, dry_run=True)

assert len(deduped_dataset) == len(dataset)
assert deduped_dataset.column_names == dataset.column_names + ["__id__", "duplicate"]
                                                                                                                                                                
# print which records were removed
for idx in dup_ids:
    if dataset[idx]["text"] == "":
        continue
    print(dataset[idx]["text"])
 Flower Fairies of the Spring ; Blackie , 1923 

 = = = Regular season = = = 

 " There 's Got to Be a Way " ( 12 " remix ) 

 = = Early life = = 

 = = Awards = = 

 = = Critical reception = = 

 = = History = = 

 = = Service history = = 

 = = Description = = 

 = = Background = =