Skip to content
Snippets Groups Projects
Commit 37bbde7b authored by Babali's avatar Babali
Browse files

Add text preprocessing

parent 6a849f5e
Branches main
No related tags found
No related merge requests found
import functools
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Download required NLTK resources.
# nltk.word_tokenize needs the Punkt sentence model: older NLTK releases load
# it from 'punkt', recent releases load it from 'punkt_tab'. Fetching both
# keeps tokenization working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Backs stopwords.words('english').
nltk.download('stopwords')
@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    # stopwords.words() reads the corpus on every call, so cache the result
    # instead of rebuilding the set for each document.
    return frozenset(stopwords.words('english'))


# One stemmer instance is enough; constructing it per call is wasted work.
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalize a piece of text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip every character that is neither a
    word character nor whitespace, tokenize with ``nltk.word_tokenize``,
    drop English stop words, and Porter-stem the remaining tokens.

    Args:
        text: The value to preprocess. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The preprocessed text as a single string; ``''`` for missing input.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercasing
    text = str(text).lower()
    # Removing special characters (underscores survive: they match \w)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Removing stop words, stemming, and joining back into a sentence
    stop_words = _english_stop_words()
    return ' '.join(
        _STEMMER.stem(token) for token in tokens if token not in stop_words
    )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment