tokenization.py
import nltk
import re

# One-time downloads of the NLTK resources used below.
nltk.download('punkt')        # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger')  # used only by the optional POS-tagging sketch below
nltk.download('stopwords')    # English stopword list
def perform_tokenization(text):
    """
    Perform both word and sentence tokenization on the input text.

    Args:
        text (str): input text to be tokenized
    Returns:
        dict: word tokens (with English stopwords removed) and sentence tokens
    """
    # Preprocessing: normalize whitespace before tokenizing.
    text = text.strip()               # remove leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace into single spaces

    # Sentence tokenization
    sentence_tokens = nltk.sent_tokenize(text)

    # Word tokenization (see the pattern-aware sketch at the end of the file
    # for keeping dates, URLs, and similar patterns as single tokens)
    word_tokens = nltk.word_tokenize(text)

    # Stopword removal: build the set once rather than re-reading the corpus
    # for every token, and compare case-insensitively so "This" is caught too.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = [token for token in word_tokens if token.lower() not in stop_words]

    return {
        'word_tokens': word_tokens,
        'sentence_tokens': sentence_tokens
    }
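# The 'averaged_perceptron_tagger' resource downloaded above is not used by
# perform_tokenization itself. A minimal sketch of how the word tokens could
# feed into POS tagging, assuming that was the intended follow-up step:
def tag_tokens(word_tokens):
    """Return (token, POS tag) pairs for a list of word tokens (illustrative)."""
    return nltk.pos_tag(word_tokens)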
# Quick demonstration when run as a script.
if __name__ == "__main__":
    text = "This is a test. I have a sentence like: '2024-11-07' or URLs like https://example.com!"
    result = perform_tokenization(text)
    print("Word Tokens:", result['word_tokens'])
    print("Sentence Tokens:", result['sentence_tokens'])