= "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
test_str0 = "This is a test string."
test_str1 assert check_compression_ratio(test_str0, dry_run=True) < check_compression_ratio(test_str1, dry_run=True)
filter
check_compression_ratio
check_compression_ratio (document, compression_threshold:float=0.5, compression_level:int=3, dry_run=False)
Checks if the document is below the character repetition threshold.
Type | Default | Details | |
---|---|---|---|
document | document to be analyzed | ||
compression_threshold | float | 0.5 | threshold for compression ratio |
compression_level | int | 3 | compression level to use |
dry_run | bool | False | if True, returns the ratio of character repetition |
Returns | bool | returns True if document is below threshold |
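The ratio itself is compressed size over original size, so highly repetitive text scores low. The `compression_level` default of 3 hints at a zstd-style compressor, but as a rough illustration (a sketch of the idea, not squeakily's actual implementation), a zlib-based version behaves the same way:

```python
import zlib

def compression_ratio(document: str, compression_level: int = 3) -> float:
    # Highly repetitive text compresses well and therefore scores lower.
    raw = document.encode("utf-8")
    return len(zlib.compress(raw, level=compression_level)) / len(raw)

# Mirrors the test above: the repetitive string compresses far better.
assert compression_ratio("a" * 34) < compression_ratio("This is a test string.")
```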
check_char_repetition
check_char_repetition (document, char_repetition_len=10, char_repetition_threshold=0.2, dry_run=False)
Checks if the document is below the character repetition threshold.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| char_repetition_len | int | 10 | length of character repetition |
| char_repetition_threshold | float | 0.2 | threshold for character repetition |
| dry_run | bool | False | if True, returns the ratio of character repetition |
| Returns | bool |  | returns True if document is below threshold |
= "aaabbbcccddd"
test_str assert check_char_repetition(test_str, char_repetition_len=3, char_repetition_threshold=0.2) == True
= "aaaaaaabbbcccddd"
test_str assert check_char_repetition(test_str, char_repetition_len=3, char_repetition_threshold=0.2) == False
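One plausible definition, consistent with the tests above (a sketch rather than squeakily's exact formula): the fraction of character n-grams that belong to an n-gram occurring more than once.

```python
from collections import Counter

def char_repetition_ratio(document: str, char_repetition_len: int = 10) -> float:
    # Fraction of character n-grams that are occurrences of a repeated n-gram.
    n = char_repetition_len
    ngrams = [document[i : i + n] for i in range(len(document) - n + 1)]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    repeated = sum(c for c in counts.values() if c > 1)
    return repeated / len(ngrams)

assert char_repetition_ratio("aaabbbcccddd", 3) == 0.0     # all 3-grams unique
assert char_repetition_ratio("aaaaaaabbbcccddd", 3) > 0.2  # "aaa" repeats
```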
check_flagged_words
check_flagged_words (document:str, flagged_words:list=['anal', 'bbw', 'bdsm', 'blowjob', 'blowjobs', 'brazzers', 'bukkake', 'camgirl', 'camwhore', 'cocksucking', 'creampie', 'cuckold', 'cum', 'cumming', 'cums', 'cumshot', 'cumshots', 'cumslut', 'cunnilingus', 'deepthroat', 'deepthroating', 'dildo', 'dildos', 'dogging', 'doggystyle', 'erotic', 'fellatio', 'femdom', 'fingering', 'fisting', 'footjob', 'gangbang', 'handjob', 'hentai', 'horney', 'horniest', 'horny', 'jism', 'jizz', 'lolli', 'lolling', 'masterbating', 'masturbate', 'masturbating', 'masturbation', 'milf', 'orgies', 'orgy', 'pegging', 'porn', 'pornhub', 'porno', 'pornos', 'pornstar', 'pornstars', 'redtube', 'rimming', 'slutty', 'strapon', 'threesome', 'vibrator', 'xhamster', 'xnxx', 'xvideos', 'xxx', 'youporn'], flagged_words_threshold:float=0.1, get_words_func:callable=<function get_words>, dry_run:bool=False)
Checks if a document contains a high percentage of flagged words.
|  | Type | Default | Details |
|---|---|---|---|
| document | str |  | document to be analyzed |
| flagged_words | list | see the signature above | list of flagged words |
| flagged_words_threshold | float | 0.1 | threshold for flagged words |
| get_words_func | callable | get_words | function to get words from document |
| dry_run | bool | False | if True, returns the ratio of flagged words |
| Returns | bool |  | returns True if document is below threshold unless dry_run is True |
The check_flagged_words filter function is purposefully hidden in this documentation, as it would show the flagged words directly, which might shock some people.
assert check_flagged_words("test") == True
assert check_flagged_words("bdsm") == False
check_perplexity
check_perplexity (document, perplexity_threshold=10000, model=None, dry_run=False)
Checks if the document is below the perplexity threshold.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| perplexity_threshold | int | 10000 | threshold for perplexity |
| model | NoneType | None | model to calculate perplexity |
| dry_run | bool | False | if True, returns the perplexity of the document |
| Returns | bool |  | returns True if document is below threshold |
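Perplexity here is the exponentiated average negative log probability per token. Against kenlm's convention of total log10 scores, a minimal sketch (not necessarily KenlmModel's exact normalization) looks like:

```python
def perplexity(model, text: str) -> float:
    # kenlm's model.score returns the total log10 probability of the text.
    log10_prob = model.score(text)
    n_tokens = len(text.split()) + 1  # +1 for the end-of-sentence token
    return 10.0 ** (-log10_prob / n_tokens)
```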
To run this test, you need to have kenlm and sentencepiece installed: `pip install https://github.com/kpu/kenlm/archive/master.zip sentencepiece`
from squeakily.helpers import KenlmModel

model = KenlmModel.from_pretrained(
    model_dataset="wikipedia",
    language="en",
    lower_case=True,
    remove_accents=True,
    normalize_numbers=True,
    punctuation=1,
)

low_test_str = "I am very perplexed"
high_test_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed ..."

assert check_perplexity(low_test_str, perplexity_threshold=1_000, model=model) == True
assert check_perplexity(high_test_str, perplexity_threshold=1_000, model=model) == False
/home/nathan/miniconda3/envs/squeakily/lib/python3.10/site-packages/huggingface_hub/file_download.py:592: FutureWarning: `cached_download` is the legacy way to download files from the HF hub, please consider upgrading to `hf_hub_download`
warnings.warn(
check_language
check_language (document, language='en', language_threshold=0.9, model=None, dry_run=False)
Checks if the document is in the given language with a confidence above the language threshold.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| language | str | en | language to check |
| language_threshold | float | 0.9 | threshold for language confidence |
| model | NoneType | None | model to check language |
| dry_run | bool | False | if True, returns the confidence for the given language (-1 if another language is detected) |
| Returns | bool |  | returns True if the document is in the given language above the threshold |
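FastTextLanguageDetector (used below) wraps a fastText language-identification model; the underlying idea, sketched here with the public fasttext API and the lid.176.bin model file (an assumption about which model is used), is to compare the predicted label's confidence against the threshold:

```python
import fasttext

lid_model = fasttext.load_model("lid.176.bin")  # assumes the model file is on disk

def language_score(document: str, language: str = "en") -> float:
    # fasttext's predict cannot handle newlines, so flatten them first.
    labels, scores = lid_model.predict(document.replace("\n", " "))
    detected = labels[0].replace("__label__", "")
    # Mirror the dry_run behaviour in the tests below: confidence if the
    # detected language matches, otherwise -1.
    return float(scores[0]) if detected == language else -1.0
```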
from squeakily.helpers import FastTextLanguageDetector

fasttext_model = FastTextLanguageDetector.from_pretrained()

english_text = "Hi, my name is John."
spanish_text = "Hola, me llamo Juan."
chinese_text = "你好,我叫张三。"
assert check_language(english_text, language="en", language_threshold=0.85, model=fasttext_model) == True
assert check_language(spanish_text, language="en", language_threshold=0.85, model=fasttext_model) == False
assert check_language(chinese_text, language="en", language_threshold=0.85, model=fasttext_model) == False
# test dry run
assert check_language(english_text, language="en", language_threshold=0.85, model=fasttext_model, dry_run=True) > 0.
assert check_language(spanish_text, language="en", language_threshold=0.85, model=fasttext_model, dry_run=True) == -1.
assert check_language(chinese_text, language="es", language_threshold=0.85, model=fasttext_model, dry_run=True) == -1.
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
check_word_number
check_word_number (document, min_word_threshold=5, max_word_threshold=100, get_words_func=<function get_words>, dry_run=False)
Checks if the document is between the minimum and maximum word thresholds.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| min_word_threshold | int | 5 | minimum number of words |
| max_word_threshold | int | 100 | maximum number of words |
| get_words_func | function | get_words | function to get words from document |
| dry_run | bool | False | if True, returns the number of words in the document |
| Returns | bool |  | returns True if document is between the minimum and maximum thresholds |
= "This is a test string."
test_str
assert check_word_number(test_str, min_word_threshold=5, max_word_threshold=10) == True
assert check_word_number(test_str, min_word_threshold=1, max_word_threshold=4) == False
check_stop_word_ratio
check_stop_word_ratio (document, stop_word_threshold=0.3, stop_words=['a', 'a.k.a', 'aboard', 'about', 'above', 'abt', 'accord', 'according', 'across', 'after', 'against', 'ago', 'aground', 'ahead', 'aka', 'ala', 'albeit', 'all', 'along', 'alongside', 'although', 'am', 'amid', 'amidst', 'among', 'amongst', 'amoung', 'an', 'and', 'and/or', 'another', 'any', 'any1', 'anybody', 'anyone', 'anything', 'are', 'around', 'as', 'aside', 'astride', 'at', 'atop', 'away', 'b', 'b/c', 'b/t', 'back', 'base', 'based', 'bc', 'be', 'because', 'been', 'before', 'behind', 'being', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'board', 'both', 'btwn', 'but', 'by', 'can', 'cause', 'circa', 'cos', 'could', 'coz', 'cus', 'depend', 'depending', 'despite', 'did', 'do', 'does', 'down', 'due', 'during', 'each', 'either', 'else', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'except', 'for', 'forth', 'from', 'get', 'gets', 'getting', 'give', 'given', 'got', 'had', 'half', 'has', 'hav', 'have', 'having', 'he', 'her', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'i', "i'd", 'if', 'in', 'include', 'including', 'inside', 'instead', 'into', 'is', 'it', "it's", 'its', 'itself', 'lest', 'like', 'made', 'many', 'may', 'me', 'might', 'mine', 'minus', 'most', 'much', 'must', 'my', 'myself', 'nary', 'near', 'nearby', 'neither', 'next', 'nigh', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'notwithstanding', 'of', 'off', 'on', 'onboard', 'once', 'one', 'ones', 'oneself', 'only', 'onto', 'opposite', 'or', 'other', 'others', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overt', 'own', 'past', 'per', 'plus', 'prior', 'quite', 'rather', 're', 'regard', 'regarding', 'regardless', 'round', 's/he', 'save', 'self', 'shall', 'she', 'should', 'side', 'since', 'so', 'some', 'somebody', 'someone', 'something', 'such', 'sure', 'teh', 'than', 'thanks', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', "they're", 'thier', 'this', 'tho', 'those', 'thou', 'though', 'through', 'throughout', 'thru', 'thy', 'til', 'till', 'to', 'together', 'too', 'toward', 'towards', 'u', 'under', 'underneath', 'unless', 'unlike', 'until', 'unto', 'up', 'upon', 'ur', 'us', 'use', 'versus', 'via', 'vs', 'vs.', 'w/', 'w/o', 'w/out', 'was', 'we', 'were', 'what', 'whatever', 'whatnot', 'when', 'whenever', 'where', 'whereas', 'wherever', 'whether', 'which', 'while', 'whilst', 'whither', 'who', "who's", 'whoever', 'whom', 'whomever', 'whose', 'why', 'will', 'with', 'within', 'without', 'wo', 'worth', 'would', 'wud', "y'all", 'ya', 'yet', 'yo', 'you', "you're", 'your', 'youre', 'yours', 'yourself', 'yourselves'], get_words_func=<function get_words>, dry_run=False)
Checks if the document contains a high percentage of stop words.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| stop_word_threshold | float | 0.3 | threshold for stop words |
| stop_words | list | see the signature above | list of stop words |
| get_words_func | function | get_words | function to get words from document |
| dry_run | bool | False | if True, returns the ratio of stop words in the document |
| Returns | bool |  | returns True if document is below the threshold |
assert check_stop_word_ratio("test") == True
assert check_stop_word_ratio("the") == False
assert check_stop_word_ratio("the funny llama", stop_word_threshold=0.3) == False
assert check_stop_word_ratio("the funny llama", stop_word_threshold=0.5) == True
# Test French stop words
assert check_stop_word_ratio("le", stop_words=stopwords['fr']) == False
assert check_stop_word_ratio("le chien est beau", stop_words=stopwords['fr'], dry_run=True) == 0.5
assert check_stop_word_ratio("le chien est beau", stop_words=stopwords['fr'], stop_word_threshold=0.3) == False
check_code_parsability
check_code_parsability (document, program_language='python')
Checks if the document contains parsable code.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| program_language | str | python | programming language to check |
| Returns | bool |  | returns True if the code is parsable |
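For Python specifically, parsability reduces to whether the standard library's ast.parse accepts the document; a minimal sketch (squeakily handles other languages through parsers that are auto-downloaded on first use, as the warnings after the tests show):

```python
import ast

def python_parsable(document: str) -> bool:
    # A document is "parsable" if it compiles to an AST without errors.
    try:
        ast.parse(document)
        return True
    except SyntaxError:
        return False

assert python_parsable("print('hello world')") is True
assert python_parsable("print('hello world'") is False  # missing closing paren
```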
# Test python code
assert check_code_parsability("print('hello world')", program_language="python") == True
assert check_code_parsability("print('hello world'", program_language="python") == False
# Test javascript code
assert check_code_parsability("console.log('hello world')", program_language="javascript") == True
assert check_code_parsability("console.log('hello world'", program_language="javascript") == False
WARNING:root:Autoloading AST parser for javascript: Start download from Github.
WARNING:root:Start cloning the parser definition from Github.
WARNING:root:Compiling language for javascript
Whole Dataset Filtering
MinHash Deduplication
The following code has all been adapted from the awesome Chenghao Mou and their work on the BigCode repository!
import numpy as np

result = _hash_func(0, "Hello world!", num_perm=128)
assert result["__id__"] == 0
assert result["__signature__"].shape == (128,)
assert result["__signature__"].dtype == np.uint64
= ["Hello world!", "Hello world"]
data = [_hash_func(i, content, num_perm=128) for i, content in enumerate(data)]
signatures = MinHashLSH(threshold=0.5, num_perm=128)
index for signature in signatures:
index.insert("__id__"],
signature[=128, hashvalues=signature["__signature__"], seed=MINHASH_SEED)
MinHash(num_perm
)assert _query_content(0, signatures[0]["__signature__"], index=index) == {'__neighbors__': [1], '__id__': 0}
assert _query_content(1, signatures[1]["__signature__"], index=index) == {'__neighbors__': [0], '__id__': 1}
assert _jaccard_similarity("a = 1", "a = 2") == 0.3333333333333333
assert _jaccard_similarity("a = 1", "a = 1") == 1.0
minhash_dedup
minhash_dedup (ds, column, community_detection:bool=False, report_false_positive_rate:bool=False, threshold:float=0.85, num_perm:int=128, dry_run:bool=False)
Deduplicate the dataset using minhashing as described in the paper “Deduplicating Training Data Makes Language Models Better”.
|  | Type | Default | Details |
|---|---|---|---|
| ds |  |  | The dataset to deduplicate. |
| column |  |  | The column to use for deduplication. |
| community_detection | bool | False | Whether to use community detection to find the duplicate communities, or to use the connected components. |
| report_false_positive_rate | bool | False | Whether to report the false positive rate. |
| threshold | float | 0.85 | The threshold to use for deduplication. |
| num_perm | int | 128 | The number of permutations to use for minhashing. |
| dry_run | bool | False | Whether to run the deduplication in dry run mode. |
| Returns | Dataset |  |  |
from datasets import load_dataset  # Hugging Face datasets

dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
dataset = dataset.select(range(1_000))
deduped_dataset = minhash_dedup(dataset, "text", community_detection=True, threshold=0.85, num_perm=128)

assert len(deduped_dataset) == len(dataset) - len(dup_ids)
assert deduped_dataset.column_names == dataset.column_names + ["__id__"]

# test dry run
deduped_dataset = minhash_dedup(dataset, "text", community_detection=True, threshold=0.85, num_perm=128, dry_run=True)

assert len(deduped_dataset) == len(dataset)
assert deduped_dataset.column_names == dataset.column_names + ["__id__", "duplicate"]

# print which records were removed
# (dup_ids holds the ids of the removed duplicates, collected earlier in the source notebook)
for idx in dup_ids:
    if dataset[idx]["text"] == "":
        continue
    print(dataset[idx]["text"])
Flower Fairies of the Spring ; Blackie , 1923
= = = Regular season = = =
" There 's Got to Be a Way " ( 12 " remix )
= = Early life = =
= = Awards = =
= = Critical reception = =
= = History = =
= = Service history = =
= = Description = =
= = Background = =