= "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
test_str0 = "This is a test string."
test_str1 assert check_compression_ratio(test_str0, dry_run=True) < check_compression_ratio(test_str1, dry_run=True)
filter
check_compression_ratio
check_compression_ratio (document, compression_threshold:float=0.5, compression_level:int=3, dry_run=False)
Checks if the document is below the character repetition threshold.
Type | Default | Details | |
---|---|---|---|
document | document to be analyzed | ||
compression_threshold | float | 0.5 | threshold for compression ratio |
compression_level | int | 3 | compression level to use |
dry_run | bool | False | if True, returns the ratio of character repetition |
Returns | bool | returns True if document is below threshold |
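The ratio itself is compressed size over original size, so highly repetitive text scores low. The `compression_level` default of 3 hints at a zstd-style compressor, but as a rough illustration (a sketch of the idea, not squeakily's actual implementation), a zlib-based version behaves the same way:

```python
import zlib

def compression_ratio(document: str, compression_level: int = 3) -> float:
    # Highly repetitive text compresses well and therefore scores lower.
    raw = document.encode("utf-8")
    return len(zlib.compress(raw, level=compression_level)) / len(raw)

# Mirrors the test above: the repetitive string compresses far better.
assert compression_ratio("a" * 34) < compression_ratio("This is a test string.")
```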
check_char_repetition
check_char_repetition (document, char_repetition_len=10, char_repetition_threshold=0.2, dry_run=False)
Checks if the document is below the character repetition threshold.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| char_repetition_len | int | 10 | length of character repetition |
| char_repetition_threshold | float | 0.2 | threshold for character repetition |
| dry_run | bool | False | if True, returns the ratio of character repetition |
| Returns | bool |  | returns True if document is below threshold |
= "aaabbbcccddd"
test_str assert check_char_repetition(test_str, char_repetition_len=3, char_repetition_threshold=0.2) == True
= "aaaaaaabbbcccddd"
test_str assert check_char_repetition(test_str, char_repetition_len=3, char_repetition_threshold=0.2) == False
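One plausible definition, consistent with the tests above (a sketch rather than squeakily's exact formula): the fraction of character n-grams that belong to an n-gram occurring more than once.

```python
from collections import Counter

def char_repetition_ratio(document: str, char_repetition_len: int = 10) -> float:
    # Fraction of character n-grams that are occurrences of a repeated n-gram.
    n = char_repetition_len
    ngrams = [document[i : i + n] for i in range(len(document) - n + 1)]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    repeated = sum(c for c in counts.values() if c > 1)
    return repeated / len(ngrams)

assert char_repetition_ratio("aaabbbcccddd", 3) == 0.0     # all 3-grams unique
assert char_repetition_ratio("aaaaaaabbbcccddd", 3) > 0.2  # "aaa" repeats
```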
check_flagged_words
check_flagged_words (document:str, flagged_words:list=['anal', 'bbw', 'bdsm', 'blowjob', 'blowjobs', 'brazzers', 'bukkake', 'camgirl', 'camwhore', 'cocksucking', 'creampie', 'cuckold', 'cum', 'cumming', 'cums', 'cumshot', 'cumshots', 'cumslut', 'cunnilingus', 'deepthroat', 'deepthroating', 'dildo', 'dildos', 'dogging', 'doggystyle', 'erotic', 'fellatio', 'femdom', 'fingering', 'fisting', 'footjob', 'gangbang', 'handjob', 'hentai', 'horney', 'horniest', 'horny', 'jism', 'jizz', 'lolli', 'lolling', 'masterbating', 'masturbate', 'masturbating', 'masturbation', 'milf', 'orgies', 'orgy', 'pegging', 'porn', 'pornhub', 'porno', 'pornos', 'pornstar', 'pornstars', 'redtube', 'rimming', 'slutty', 'strapon', 'threesome', 'vibrator', 'xhamster', 'xnxx', 'xvideos', 'xxx', 'youporn'], flagged_words_threshold:float=0.1, get_words_func:callable=<function get_words>, dry_run:bool=False)
Checks if a document contains a high percentage of flagged words.
|  | Type | Default | Details |
|---|---|---|---|
| document | str |  | document to be analyzed |
| flagged_words | list | see the signature above | list of flagged words |
| flagged_words_threshold | float | 0.1 | threshold for flagged words |
| get_words_func | callable | get_words | function to get words from document |
| dry_run | bool | False | if True, returns the ratio of flagged words |
| Returns | bool |  | returns True if document is below threshold unless dry_run is True |
The check_flagged_words filter function is purposefully hidden in this documentation, as it would show the flagged words directly, which might shock some people.
assert check_flagged_words("test") == True
assert check_flagged_words("bdsm") == False
check_perplexity
check_perplexity (document, perplexity_threshold=10000, model=None, dry_run=False)
Checks if the document is below the perplexity threshold.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| perplexity_threshold | int | 10000 | threshold for perplexity |
| model | NoneType | None | model to calculate perplexity |
| dry_run | bool | False | if True, returns the perplexity of the document |
| Returns | bool |  | returns True if document is below threshold |
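Perplexity here is the exponentiated average negative log probability per token. Against kenlm's convention of total log10 scores, a minimal sketch (not necessarily KenlmModel's exact normalization) looks like:

```python
def perplexity(model, text: str) -> float:
    # kenlm's model.score returns the total log10 probability of the text.
    log10_prob = model.score(text)
    n_tokens = len(text.split()) + 1  # +1 for the end-of-sentence token
    return 10.0 ** (-log10_prob / n_tokens)
```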
To run this test, you need to have kenlm and sentencepiece installed: `pip install https://github.com/kpu/kenlm/archive/master.zip sentencepiece`
from squeakily.helpers import KenlmModel

model = KenlmModel.from_pretrained(
    model_dataset="wikipedia",
    language="en",
    lower_case=True,
    remove_accents=True,
    normalize_numbers=True,
    punctuation=1,
)

low_test_str = "I am very perplexed"
high_test_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed ..."

assert check_perplexity(low_test_str, perplexity_threshold=1_000, model=model) == True
assert check_perplexity(high_test_str, perplexity_threshold=1_000, model=model) == False
/home/nathan/miniconda3/envs/squeakily/lib/python3.10/site-packages/huggingface_hub/file_download.py:592: FutureWarning: `cached_download` is the legacy way to download files from the HF hub, please consider upgrading to `hf_hub_download`
warnings.warn(
check_language
check_language (document, language='en', language_threshold=0.9, model=None, dry_run=False)
Checks if the document is in the given language with a confidence above the language threshold.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| language | str | en | language to check |
| language_threshold | float | 0.9 | threshold for language confidence |
| model | NoneType | None | model to check language |
| dry_run | bool | False | if True, returns the confidence for the given language (-1 if another language is detected) |
| Returns | bool |  | returns True if the document is in the given language above the threshold |
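FastTextLanguageDetector (used below) wraps a fastText language-identification model; the underlying idea, sketched here with the public fasttext API and the lid.176.bin model file (an assumption about which model is used), is to compare the predicted label's confidence against the threshold:

```python
import fasttext

lid_model = fasttext.load_model("lid.176.bin")  # assumes the model file is on disk

def language_score(document: str, language: str = "en") -> float:
    # fasttext's predict cannot handle newlines, so flatten them first.
    labels, scores = lid_model.predict(document.replace("\n", " "))
    detected = labels[0].replace("__label__", "")
    # Mirror the dry_run behaviour in the tests below: confidence if the
    # detected language matches, otherwise -1.
    return float(scores[0]) if detected == language else -1.0
```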
from squeakily.helpers import FastTextLanguageDetector

fasttext_model = FastTextLanguageDetector.from_pretrained()

english_text = "Hi, my name is John."
spanish_text = "Hola, me llamo Juan."
chinese_text = "你好,我叫张三。"
assert check_language(english_text, language="en", language_threshold=0.85, model=fasttext_model) == True
assert check_language(spanish_text, language="en", language_threshold=0.85, model=fasttext_model) == False
assert check_language(chinese_text, language="en", language_threshold=0.85, model=fasttext_model) == False
# test dry run
assert check_language(english_text, language="en", language_threshold=0.85, model=fasttext_model, dry_run=True) > 0.
assert check_language(spanish_text, language="en", language_threshold=0.85, model=fasttext_model, dry_run=True) == -1.
assert check_language(chinese_text, language="es", language_threshold=0.85, model=fasttext_model, dry_run=True) == -1.
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
check_word_number
check_word_number (document, min_word_threshold=5, max_word_threshold=100, get_words_func=<function get_words>, dry_run=False)
Checks if the document is between the minimum and maximum word thresholds.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| min_word_threshold | int | 5 | minimum number of words |
| max_word_threshold | int | 100 | maximum number of words |
| get_words_func | function | get_words | function to get words from document |
| dry_run | bool | False | if True, returns the number of words in the document |
| Returns | bool |  | returns True if document is between the minimum and maximum thresholds |
= "This is a test string."
test_str
assert check_word_number(test_str, min_word_threshold=5, max_word_threshold=10) == True
assert check_word_number(test_str, min_word_threshold=1, max_word_threshold=4) == False
check_stop_word_ratio
check_stop_word_ratio (document, stop_word_threshold=0.3, stop_words=['a', 'a.k.a', 'aboard', 'about', 'above', 'abt', 'accord', 'according', 'across', 'after', 'against', 'ago', 'aground', 'ahead', 'aka', 'ala', 'albeit', 'all', 'along', 'alongside', 'although', 'am', 'amid', 'amidst', 'among', 'amongst', 'amoung', 'an', 'and', 'and/or', 'another', 'any', 'any1', 'anybody', 'anyone', 'anything', 'are', 'around', 'as', 'aside', 'astride', 'at', 'atop', 'away', 'b', 'b/c', 'b/t', 'back', 'base', 'based', 'bc', 'be', 'because', 'been', 'before', 'behind', 'being', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'board', 'both', 'btwn', 'but', 'by', 'can', 'cause', 'circa', 'cos', 'could', 'coz', 'cus', 'depend', 'depending', 'despite', 'did', 'do', 'does', 'down', 'due', 'during', 'each', 'either', 'else', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'except', 'for', 'forth', 'from', 'get', 'gets', 'getting', 'give', 'given', 'got', 'had', 'half', 'has', 'hav', 'have', 'having', 'he', 'her', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'i', "i'd", 'if', 'in', 'include', 'including', 'inside', 'instead', 'into', 'is', 'it', "it's", 'its', 'itself', 'lest', 'like', 'made', 'many', 'may', 'me', 'might', 'mine', 'minus', 'most', 'much', 'must', 'my', 'myself', 'nary', 'near', 'nearby', 'neither', 'next', 'nigh', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'notwithstanding', 'of', 'off', 'on', 'onboard', 'once', 'one', 'ones', 'oneself', 'only', 'onto', 'opposite', 'or', 'other', 'others', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overt', 'own', 'past', 'per', 'plus', 'prior', 'quite', 'rather', 're', 'regard', 'regarding', 'regardless', 'round', 's/he', 'save', 'self', 'shall', 'she', 'should', 'side', 'since', 'so', 'some', 'somebody', 'someone', 'something', 'such', 'sure', 'teh', 'than', 'thanks', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', "they're", 'thier', 'this', 'tho', 'those', 'thou', 'though', 'through', 'throughout', 'thru', 'thy', 'til', 'till', 'to', 'together', 'too', 'toward', 'towards', 'u', 'under', 'underneath', 'unless', 'unlike', 'until', 'unto', 'up', 'upon', 'ur', 'us', 'use', 'versus', 'via', 'vs', 'vs.', 'w/', 'w/o', 'w/out', 'was', 'we', 'were', 'what', 'whatever', 'whatnot', 'when', 'whenever', 'where', 'whereas', 'wherever', 'whether', 'which', 'while', 'whilst', 'whither', 'who', "who's", 'whoever', 'whom', 'whomever', 'whose', 'why', 'will', 'with', 'within', 'without', 'wo', 'worth', 'would', 'wud', "y'all", 'ya', 'yet', 'yo', 'you', "you're", 'your', 'youre', 'yours', 'yourself', 'yourselves'], get_words_func=<function get_words>, dry_run=False)
Checks if the document contains a high percentage of stop words.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| stop_word_threshold | float | 0.3 | threshold for stop words |
| stop_words | list | see the signature above | list of stop words |
| get_words_func | function | get_words | function to get words from document |
| dry_run | bool | False | if True, returns the ratio of stop words in the document |
| Returns | bool |  | returns True if document is below the threshold |
assert check_stop_word_ratio("test") == True
assert check_stop_word_ratio("the") == False
assert check_stop_word_ratio("the funny llama", stop_word_threshold=0.3) == False
assert check_stop_word_ratio("the funny llama", stop_word_threshold=0.5) == True
# Test French stop words
assert check_stop_word_ratio("le", stop_words=stopwords['fr']) == False
assert check_stop_word_ratio("le chien est beau", stop_words=stopwords['fr'], dry_run=True) == 0.5
assert check_stop_word_ratio("le chien est beau", stop_words=stopwords['fr'], stop_word_threshold=0.3) == False
check_code_parsability
check_code_parsability (document, program_language='python')
Checks if the document contains parsable code.
|  | Type | Default | Details |
|---|---|---|---|
| document |  |  | document to be analyzed |
| program_language | str | python | programming language to check |
| Returns | bool |  | returns True if the code is parsable |
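For Python specifically, parsability reduces to whether the standard library's ast.parse accepts the document; a minimal sketch (squeakily handles other languages through parsers that are auto-downloaded on first use, as the warnings after the tests show):

```python
import ast

def python_parsable(document: str) -> bool:
    # A document is "parsable" if it compiles to an AST without errors.
    try:
        ast.parse(document)
        return True
    except SyntaxError:
        return False

assert python_parsable("print('hello world')") is True
assert python_parsable("print('hello world'") is False  # missing closing paren
```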
# Test python code
assert check_code_parsability("print('hello world')", program_language="python") == True
assert check_code_parsability("print('hello world'", program_language="python") == False
# Test javascript code
assert check_code_parsability("console.log('hello world')", program_language="javascript") == True
assert check_code_parsability("console.log('hello world'", program_language="javascript") == False
WARNING:root:Autoloading AST parser for javascript: Start download from Github.
WARNING:root:Start cloning the parser definition from Github.
WARNING:root:Compiling language for javascript
Whole Dataset Filtering
MinHash Deduplication
The following code has all been adapted from the awesome Chenghao Mou and their work on the BigCode repository!
import numpy as np

result = _hash_func(0, "Hello world!", num_perm=128)
assert result["__id__"] == 0
assert result["__signature__"].shape == (128,)
assert result["__signature__"].dtype == np.uint64
= ["Hello world!", "Hello world"]
data = [_hash_func(i, content, num_perm=128) for i, content in enumerate(data)]
signatures = MinHashLSH(threshold=0.5, num_perm=128)
index for signature in signatures:
index.insert("__id__"],
signature[=128, hashvalues=signature["__signature__"], seed=MINHASH_SEED)
MinHash(num_perm
)assert _query_content(0, signatures[0]["__signature__"], index=index) == {'__neighbors__': [1], '__id__': 0}
assert _query_content(1, signatures[1]["__signature__"], index=index) == {'__neighbors__': [0], '__id__': 1}
assert _jaccard_similarity("a = 1", "a = 2") == 0.3333333333333333
assert _jaccard_similarity("a = 1", "a = 1") == 1.0
minhash_dedup
minhash_dedup (ds, column, community_detection:bool=False, report_false_positive_rate:bool=False, threshold:float=0.85, num_perm:int=128, dry_run:bool=False)
Deduplicate the dataset using minhashing as described in the paper “Deduplicating Training Data Makes Language Models Better”.
|  | Type | Default | Details |
|---|---|---|---|
| ds |  |  | The dataset to deduplicate. |
| column |  |  | The column to use for deduplication. |
| community_detection | bool | False | Whether to use community detection to find the duplicate communities, or to use the connected components. |
| report_false_positive_rate | bool | False | Whether to report the false positive rate. |
| threshold | float | 0.85 | The threshold to use for deduplication. |
| num_perm | int | 128 | The number of permutations to use for minhashing. |
| dry_run | bool | False | Whether to run the deduplication in dry run mode. |
| Returns | Dataset |  |  |
from datasets import load_dataset  # Hugging Face datasets

dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
dataset = dataset.select(range(1_000))
deduped_dataset = minhash_dedup(dataset, "text", community_detection=True, threshold=0.85, num_perm=128)

assert len(deduped_dataset) == len(dataset) - len(dup_ids)
assert deduped_dataset.column_names == dataset.column_names + ["__id__"]

# test dry run
deduped_dataset = minhash_dedup(dataset, "text", community_detection=True, threshold=0.85, num_perm=128, dry_run=True)

assert len(deduped_dataset) == len(dataset)
assert deduped_dataset.column_names == dataset.column_names + ["__id__", "duplicate"]

# print which records were removed
# (dup_ids holds the ids of the removed duplicates, collected earlier in the source notebook)
for idx in dup_ids:
    if dataset[idx]["text"] == "":
        continue
    print(dataset[idx]["text"])
Flower Fairies of the Spring ; Blackie , 1923
= = = Regular season = = =
" There 's Got to Be a Way " ( 12 " remix )
= = Early life = =
= = Awards = =
= = Critical reception = =
= = History = =
= = Service history = =
= = Description = =
= = Background = =