import re
import string
import unicodedata

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Download NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')   # required by word_tokenize on newer NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')     # needed by WordNetLemmatizer on some NLTK versions
# Initialize tools
lemmatizer = WordNetLemmatizer()
stopword_set = set(stopwords.words('english'))
# Unicode-safe whitespace cleaner
def clean_whitespace(text):
    # Replace all Unicode space characters (category 'Z') with a plain space
    text = ''.join(
        ' ' if unicodedata.category(char).startswith('Z') else char
        for char in text
    )
    # Remove zero-width and invisible characters
    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
    # Collapse all types of whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text
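# Quick illustration of clean_whitespace (the sample string is an assumption
# made up for this example, not from any dataset):
#   clean_whitespace("price:\u00a010\u200b000  USD")
# The no-break space (category Zs) becomes a regular space, the zero-width
# space is dropped, and the run of spaces collapses, giving "price: 10000 USD".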
# Main preprocessing function
def clean_text(text):
    # Convert to lowercase and strip surrounding whitespace
    text = str(text).lower().strip()
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    # Replace multiple periods with a space
    text = re.sub(r'\.{2,}', ' ', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Clean up any strange or extra whitespace before tokenizing
    text = clean_whitespace(text)
    # Tokenize
    tokens = word_tokenize(text)
    # Lemmatize and remove stopwords
    cleaned_tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in stopword_set and tok.strip()
    ]
    # Rejoin with spaces and do a final whitespace cleanup
    cleaned_text = ' '.join(cleaned_tokens)
    cleaned_text = clean_whitespace(cleaned_text)
    return cleaned_text
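# A minimal usage sketch. The sample string below is an assumption invented
# for illustration; it is not part of the original pipeline or any dataset.
# The \u2014 escape is an em dash, included to exercise the non-ASCII removal.
if __name__ == "__main__":
    raw = "<p>Check out   https://example.com \u2014 we scored 10/10!!!</p>"
    # HTML tags, the URL, the non-ASCII dash, digits, punctuation, and
    # stopwords are stripped; the surviving tokens are lemmatized and rejoined.
    print(clean_text(raw))  # typically prints: check scored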