# test the normalize_whitespace function
assert normalize_whitespace("a b c d e f g h ijk") == "a b c d e f g h i j k"
clean
normalize_whitespace
normalize_whitespace (text:str)
Replace the various whitespace characters with the standard one.
| | Type | Details |
|---|---|---|
| text | str | The text to normalize |
| Returns | str | The normalized text |
normalize_punctuation
normalize_punctuation (text:str)
Replace the various unicode punctuation characters with the standard ones.
| | Type | Details |
|---|---|---|
| text | str | The text to normalize |
| Returns | str | The normalized text |
# test the normalize_punctuation function
text = ",。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►"
assert normalize_punctuation(text) == ",.,\"\"\"\"\"\"\"\"\"\"\'::?!();- - . ~\'...-<>[]%-"
remove_empty_lines
remove_empty_lines (text:str)
Remove empty lines from the text. Solution from https://stackoverflow.com/a/3711884/5768407
| | Type | Details |
|---|---|---|
| text | str | The text to remove empty lines from |
| Returns | str | The text with empty lines removed |
# test the remove_empty_lines function
starts_with_newline = "\nfoo\nbar"
multiple_newlines = "foo\n\nbar"
ends_with_newline = "foo\nbar\n"
assert remove_empty_lines(starts_with_newline) == "foo\nbar"
assert remove_empty_lines(multiple_newlines) == "foo\nbar"
assert remove_empty_lines(ends_with_newline) == "foo\nbar"
replace_urls
replace_urls (text:str, dummy:str='https://example.com/')
Replace urls from text with a dummy.
| | Type | Default | Details |
|---|---|---|---|
| text | str | | The text to replace URLs in |
| dummy | str | https://example.com/ | The dummy text to replace URLs with |
| Returns | str | | The text with URLs replaced |
# test the replace_urls function
url_after_space = "foo http://bar.com"
url_before_space = "http://foo.com bar"
assert replace_urls(url_after_space) == "foo https://example.com/"
assert replace_urls(url_before_space) == "https://example.com/ bar"
replace_dates
replace_dates (text:str, dummy:str='2007-12-23')
Replace dates from text with a dummy.
| | Type | Default | Details |
|---|---|---|---|
| text | str | | The text to remove dates from |
| dummy | str | 2007-12-23 | The dummy text to replace dates with |
| Returns | str | | The text with dates replaced |
# test the replace_dates function
date_after_space = "foo 1/1/2020"
date_before_space = "1/1/2020 bar"
assert replace_dates(date_after_space, "1/1/1970") == "foo 1/1/1970"
assert replace_dates(date_before_space, "1/1/1970") == "1/1/1970 bar"
PII Removal
Currently, we support the following PII removal options:
- replace_email
- replace_phone
- replace_ip
- replace_credit_card
- replace_ssn

However, for emails, phone numbers, credit cards, and SSNs, we recommend using the scrubadub library.
replace_email
replace_email (text:str, dummy:str='gaustin@example.org')
Replace email addresses from text with a dummy.
| | Type | Default | Details |
|---|---|---|---|
| text | str | | The text to replace email addresses in |
| dummy | str | gaustin@example.org | The dummy text to replace email addresses with |
| Returns | str | | The text with email addresses replaced |
# test the replace_email function
email_after_space = "foo fake@email.com"
email_before_space = "fake@email.com bar"
email_with_forward_periods = "foo.bar@email.com"
email_with_backward_periods = "foo@bar.email.com"
assert replace_email(email_after_space, "example@email.com") == "foo example@email.com"
assert replace_email(email_before_space, "example@email.com") == "example@email.com bar"
assert replace_email(email_with_forward_periods, "example@email.com") == "example@email.com"
assert replace_email(email_with_backward_periods, "example@email.com") == "example@email.com"
replace_phone
replace_phone (text:str, dummy:str='267.517.3897')
Replace phone numbers from text with a dummy.
| | Type | Default | Details |
|---|---|---|---|
| text | str | | The text to replace phone numbers in |
| dummy | str | 267.517.3897 | The dummy text to replace phone numbers with |
| Returns | str | | The text with phone numbers replaced |
# test the replace_phone function
phone_after_space = "foo 111-222-3333"
phone_before_space = "111-222-3333 bar"
phone_with_parens = "(111) 222-3333"
phone_with_spaces = "111 222 3333"
phone_with_dashes = "111-222-3333"
assert replace_phone(phone_after_space, "123-456-7890") == "foo 123-456-7890"
assert replace_phone(phone_before_space, "123-456-7890") == "123-456-7890 bar"
assert replace_phone(phone_with_parens, "123-456-7890") == "123-456-7890"
assert replace_phone(phone_with_spaces, "123-456-7890") == "123-456-7890"
assert replace_phone(phone_with_dashes, "123-456-7890") == "123-456-7890"
replace_ip
replace_ip (text, dummy1:str='84.107.244.23', dummy2:str='db2c:7ab5:1955:85ff:dfcd:786a:b58b:afe0')
Replace ip addresses from text with a dummy. Solution from https://github.com/bigcode-project/bigcode-analysis/blob/main/data_analysis/pii/utils/emails_ip_addresses_detection.py#L48
| | Type | Default | Details |
|---|---|---|---|
| text | | | The text to replace ip addresses in |
| dummy1 | str | 84.107.244.23 | The dummy text to replace ipv4 addresses with |
| dummy2 | str | db2c:7ab5:1955:85ff:dfcd:786a:b58b:afe0 | The dummy text to replace ipv6 addresses with |
| Returns | str | | The text with ip addresses replaced |
# test the replace_ip function
ip4_after_space = "foo 111.222.3.4"
ip4_before_space = "111.222.3.4 bar"
ip6_with_colons = "2001:0db8:0000:0000:0000:8a2e:0370:7334"
assert replace_ip(ip4_after_space, "127.0.0.1") == "foo 127.0.0.1"
assert replace_ip(ip4_before_space, "127.0.0.1") == "127.0.0.1 bar"
assert replace_ip(ip6_with_colons, "127.0.0.1", "0:0:0:0:0:0:0:1") == "0:0:0:0:0:0:0:1"
replace_credit_card
replace_credit_card (text:str, dummy:str='180069843287712')
Replace credit card numbers from text with a dummy.
| | Type | Default | Details |
|---|---|---|---|
| text | str | | The text to replace credit card numbers in |
| dummy | str | 180069843287712 | The dummy text to replace credit card numbers with |
| Returns | str | | The text with credit card numbers replaced |
# test the replace_credit_card function
credit_card_after_space = "foo 1111-2222-3333-4444"
credit_card_before_space = "1111-2222-3333-4444 bar"
assert replace_credit_card(credit_card_after_space, "1234-5678-9012-3456") == "foo 1234-5678-9012-3456"
assert replace_credit_card(credit_card_before_space, "1234-5678-9012-3456") == "1234-5678-9012-3456 bar"
replace_ssn
replace_ssn (text:str, dummy:str='577-88-7519')
Replace social security numbers from text with a dummy.
| | Type | Default | Details |
|---|---|---|---|
| text | str | | The text to replace social security numbers in |
| dummy | str | 577-88-7519 | The dummy text to replace social security numbers with |
| Returns | str | | The text with social security numbers replaced |
# test the replace_ssn function
ssn_after_space = "foo 111-22-3333"
ssn_before_space = "111-22-3333 bar"
assert replace_ssn(ssn_after_space, "123-45-6789") == "foo 123-45-6789"
assert replace_ssn(ssn_before_space, "123-45-6789") == "123-45-6789 bar"
fix_utf8_encoding
fix_utf8_encoding (text:str)
Fix utf8 text using ftfy.
| | Type | Details |
|---|---|---|
| text | str | The text to fix |
| Returns | str | The fixed text |
# test the fix_utf8_encoding function
bad_text = '✔ No problems'
assert fix_utf8_encoding(bad_text) == '✔ No problems'
bad_text = 'déjà vu'
assert fix_utf8_encoding(bad_text) == 'déjà vu'
bad_text = 'é'
assert fix_utf8_encoding(bad_text) == 'é'
bad_text = 'P&EACUTE;REZ'
assert fix_utf8_encoding(bad_text) == 'PÉREZ'