-
Notifications
You must be signed in to change notification settings - Fork 0
/
pos_tagging.py
41 lines (33 loc) · 1.31 KB
/
pos_tagging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import nltk
import re
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
def perform_pos_tagging(text):
"""
Function to perform POS tagging with preprocessing to improve accuracy.
Args:
text (str): input text for POS tagging
Returns:
list: list of tuples with word and its POS tag
"""
# Step 1: Preprocessing
text = text.strip()
text = re.sub(r'\s+', ' ', text)
# Remove URLs, emails, and other unwanted entities
text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
text = re.sub(r'\S+@\S+', '', text) # Remove email addresses
# Step 2: Tokenize text
tokens = nltk.word_tokenize(text)
# Step 3: Remove stopwords (commonly ignored in many NLP tasks)
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word.lower() not in stop_words]
# Step 4: POS tagging using NLTK's built-in tagger
pos_tags = nltk.pos_tag(tokens)
return pos_tags
# Test the function
# if __name__ == "__main__":
# text = "The quick brown fox jumps over the lazy dog. Check out this link: https://example.com!"
# result = perform_pos_tagging(text)
# print("POS Tags:", result)