From 43e39eb721928f8ebd7c161e245e2b916f0689f2 Mon Sep 17 00:00:00 2001 From: UTSAV SINGHAL Date: Sun, 21 Jul 2024 14:13:27 +0530 Subject: [PATCH 1/6] Add files via upload --- NLP/Algorithms/TF-IDF/tf-idf.ipynb | 1 + NLP/Algorithms/Word2Vec/word2vec.ipynb | 1 + 2 files changed, 2 insertions(+) create mode 100644 NLP/Algorithms/TF-IDF/tf-idf.ipynb create mode 100644 NLP/Algorithms/Word2Vec/word2vec.ipynb diff --git a/NLP/Algorithms/TF-IDF/tf-idf.ipynb b/NLP/Algorithms/TF-IDF/tf-idf.ipynb new file mode 100644 index 0000000..e0a3dfa --- /dev/null +++ b/NLP/Algorithms/TF-IDF/tf-idf.ipynb @@ -0,0 +1 @@ +{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30746,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"#### The TF-IDF (Term Frequency-Inverse Document Frequency) algorithm is used to convert a collection of text documents into a matrix of TF-IDF features. It is commonly used in text mining and information retrieval to reflect the importance of a word in a document relative to a collection of documents.","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"code","source":"import math\nfrom collections import Counter\n\nclass TFIDF:\n def __init__(self):\n self.vocabulary = {} # Vocabulary to store word indices\n self.idf_values = {} # IDF values for words\n\n def fit(self, documents):\n \"\"\"\n Compute IDF values based on the provided documents.\n \n Args:\n documents (list of str): List of documents where each document is a string.\n \"\"\"\n doc_count = len(documents)\n term_doc_count = Counter() # To count the number of documents containing each word\n\n # Count occurrences of words in documents\n for doc in documents:\n words = set(doc.split()) # Unique words in the current document\n for word in words:\n term_doc_count[word] += 1\n\n # Compute IDF values\n self.idf_values = {\n word: math.log(doc_count / (count + 1)) # +1 to avoid division by zero\n for word, count in term_doc_count.items()\n }\n\n # Build vocabulary\n self.vocabulary = {word: idx for idx, word in enumerate(self.idf_values.keys())}\n\n def transform(self, documents):\n \"\"\"\n Transform documents into TF-IDF representation.\n\n Args:\n documents (list of str): List of documents where each document is a string.\n \n Returns:\n list of list of float: TF-IDF matrix where each row corresponds to a document.\n \"\"\"\n rows = []\n for doc in documents:\n words = doc.split()\n word_count = Counter(words)\n doc_length = len(words)\n row = [0] * len(self.vocabulary)\n\n for word, count in word_count.items():\n if word in self.vocabulary:\n tf = count / doc_length\n idf = self.idf_values[word]\n index = self.vocabulary[word]\n row[index] = tf * idf\n rows.append(row)\n return rows\n\n def fit_transform(self, documents):\n \"\"\"\n Compute IDF values and transform documents into TF-IDF representation.\n\n Args:\n documents (list of str): List of documents where each document is a string.\n\n Returns:\n list of list of float: TF-IDF matrix where each row corresponds to a document.\n \"\"\"\n self.fit(documents)\n return 
self.transform(documents)","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:08:08.207148Z","iopub.execute_input":"2024-07-20T10:08:08.207645Z","iopub.status.idle":"2024-07-20T10:08:08.222510Z","shell.execute_reply.started":"2024-07-20T10:08:08.207605Z","shell.execute_reply":"2024-07-20T10:08:08.221404Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"# Example usage\nif __name__ == \"__main__\":\n documents = [\n \"the cat sat on the mat\",\n \"the dog ate my homework\",\n \"the cat ate the dog food\"\n ]\n\n tfidf = TFIDF()\n tfidf_matrix = tfidf.fit_transform(documents)\n for i, row in enumerate(tfidf_matrix):\n print(f\"Document {i}: {row}\")","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:08:10.692831Z","iopub.execute_input":"2024-07-20T10:08:10.693205Z","iopub.status.idle":"2024-07-20T10:08:10.699967Z","shell.execute_reply.started":"2024-07-20T10:08:10.693178Z","shell.execute_reply":"2024-07-20T10:08:10.698625Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"Document 0: [0.0, -0.09589402415059363, 0.06757751801802739, 0.06757751801802739, 0.06757751801802739, 0, 0, 0, 0, 0]\nDocument 1: [0, -0.05753641449035618, 0, 0, 0, 0.08109302162163289, 0.08109302162163289, 0.0, 0.0, 0]\nDocument 2: [0.0, -0.09589402415059363, 0, 0, 0, 0, 0, 0.0, 0.0, 0.06757751801802739]\n","output_type":"stream"}]},{"cell_type":"code","source":"# Additional example usage\nif __name__ == \"__main__\":\n # Sample documents\n documents = [\n \"I love programming in Python\",\n \"Machine learning is fun\",\n \"Python is a versatile language\",\n \"Learning new skills is always beneficial\"\n ]\n\n # Initialize the TF-IDF model\n tfidf = TFIDF()\n \n # Fit the model and transform the documents\n tfidf_matrix = tfidf.fit_transform(documents)\n \n # Print the vocabulary\n print(\"Vocabulary:\", tfidf.vocabulary)\n \n # Print the TF-IDF representation\n print(\"TF-IDF Representation:\")\n for i, vector in enumerate(tfidf_matrix):\n print(f\"Document {i + 1}: {vector}\")\n\n # More example documents with mixed content\n more_documents = [\n \"the quick brown fox jumps over the lazy dog\",\n \"a journey of a thousand miles begins with a single step\",\n \"to be or not to be that is the question\",\n \"the rain in Spain stays mainly in the plain\",\n \"all human beings are born free and equal in dignity and rights\"\n ]\n\n # Fit the model and transform the new set of documents\n tfidf_more = TFIDF()\n tfidf_matrix_more = tfidf_more.fit_transform(more_documents)\n \n # Print the vocabulary for the new documents\n print(\"\\nVocabulary for new documents:\", tfidf_more.vocabulary)\n \n # Print the TF-IDF representation for the new documents\n print(\"TF-IDF Representation for new documents:\")\n for i, vector in enumerate(tfidf_matrix_more):\n print(f\"Document {i + 1}: {vector}\")","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:09:51.105985Z","iopub.execute_input":"2024-07-20T10:09:51.107160Z","iopub.status.idle":"2024-07-20T10:09:51.118181Z","shell.execute_reply.started":"2024-07-20T10:09:51.107108Z","shell.execute_reply":"2024-07-20T10:09:51.116972Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"Vocabulary: {'love': 0, 'I': 1, 'Python': 2, 'programming': 3, 'in': 4, 'learning': 5, 'fun': 6, 'Machine': 7, 'is': 8, 'a': 9, 'language': 10, 'versatile': 11, 'Learning': 12, 'beneficial': 13, 'new': 14, 'always': 15, 'skills': 16}\nTF-IDF Representation:\nDocument 1: [0.13862943611198905, 
0.13862943611198905, 0.05753641449035617, 0.13862943611198905, 0.13862943611198905, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 2: [0, 0, 0, 0, 0, 0.17328679513998632, 0.17328679513998632, 0.17328679513998632, 0.0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 3: [0, 0, 0.05753641449035617, 0, 0, 0, 0, 0, 0.0, 0.13862943611198905, 0.13862943611198905, 0.13862943611198905, 0, 0, 0, 0, 0]\nDocument 4: [0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0, 0.11552453009332421, 0.11552453009332421, 0.11552453009332421, 0.11552453009332421, 0.11552453009332421]\n\nVocabulary for new documents: {'brown': 0, 'fox': 1, 'quick': 2, 'over': 3, 'the': 4, 'lazy': 5, 'dog': 6, 'jumps': 7, 'thousand': 8, 'journey': 9, 'single': 10, 'a': 11, 'step': 12, 'with': 13, 'of': 14, 'miles': 15, 'begins': 16, 'to': 17, 'or': 18, 'question': 19, 'not': 20, 'be': 21, 'that': 22, 'is': 23, 'Spain': 24, 'rain': 25, 'mainly': 26, 'plain': 27, 'stays': 28, 'in': 29, 'human': 30, 'and': 31, 'all': 32, 'born': 33, 'equal': 34, 'dignity': 35, 'are': 36, 'rights': 37, 'beings': 38, 'free': 39}\nTF-IDF Representation for new documents:\nDocument 1: [0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.049587455847602165, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 2: [0, 0, 0, 0, 0, 0, 0, 0, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0.249897472329315, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 3: [0, 0, 0, 0, 0.02231435513142098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.18325814637483104, 0.09162907318741552, 0.09162907318741552, 0.09162907318741552, 0.18325814637483104, 0.09162907318741552, 0.09162907318741552, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 4: [0, 0, 0, 0, 0.049587455847602165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.11351680528133126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 5: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.04256880198049923, 0.07635756098951292, 0.15271512197902584, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292]\n","output_type":"stream"}]},{"cell_type":"markdown","source":"#### Explanation:\n\n1. **Initialization**:\n - `self.vocabulary`: Dictionary to store the mapping of words to their indices in the TF-IDF matrix.\n - `self.idf_values`: Dictionary to store the IDF (Inverse Document Frequency) values for each word.\n\n2. **`fit` Method**:\n - **Input**: List of documents.\n - **Purpose**: Calculate the IDF values for all unique words in the corpus.\n - **Steps**:\n 1. Count the number of documents containing each word.\n 2. Compute the IDF for each word using the formula:\n $$\n \\text{IDF}(word) = \\log \\left(\\frac{\\text{Total number of documents}}{\\text{Number of documents containing the word} + 1}\\right)\n $$\n Adding 1 avoids division by zero.\n 3. Build the vocabulary with word-to-index mapping.\n\n3. **`transform` Method**:\n - **Input**: List of documents.\n - **Purpose**: Convert each document into a TF-IDF representation.\n - **Steps**:\n 1. 
Compute Term Frequency (TF) for each word in the document:\n $$\n \\text{TF} = \\frac{\\text{Count of the word}}{\\text{Total number of words in the document}}\n $$\n 2. Compute the TF-IDF value:\n $$\n \\text{TF-IDF} = \\text{TF} \\times \\text{IDF}\n $$\n 3. Store the TF-IDF values in a matrix where each row corresponds to a document.\n\n4. **`fit_transform` Method**:\n - **Purpose**: Perform both fitting (computing IDF values) and transforming (converting documents to TF-IDF representation) in one step.","metadata":{}}]} \ No newline at end of file diff --git a/NLP/Algorithms/Word2Vec/word2vec.ipynb b/NLP/Algorithms/Word2Vec/word2vec.ipynb new file mode 100644 index 0000000..cb909d2 --- /dev/null +++ b/NLP/Algorithms/Word2Vec/word2vec.ipynb @@ -0,0 +1 @@ +{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30746,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### What is Word2Vec?\n\n**Word2Vec** is a technique to learn word embeddings using neural networks. The primary goal is to represent words in a continuous vector space where semantically similar words are mapped to nearby points. Word2Vec can be implemented using two main architectures:\n\n1. **Continuous Bag of Words (CBOW)**: Predicts the target word based on the context words (surrounding words).\n2. **Skip-gram**: Predicts the context words based on a given target word.\n\nIn this example, we'll focus on the Skip-gram approach, which is more commonly used in practice. 
The Skip-gram model tries to maximize the probability of context words given a target word.","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"code","source":"import numpy as np\n\nclass Word2Vec:\n def __init__(self, window_size=2, embedding_dim=10, learning_rate=0.01):\n # Initialize parameters\n self.window_size = window_size\n self.embedding_dim = embedding_dim\n self.learning_rate = learning_rate\n self.vocabulary = {}\n self.word_index = {}\n self.index_word = {}\n self.W1 = None\n self.W2 = None\n\n def tokenize(self, documents):\n # Tokenize documents and build vocabulary\n vocabulary = set()\n for doc in documents:\n words = doc.split()\n vocabulary.update(words)\n \n self.vocabulary = list(vocabulary)\n self.word_index = {word: idx for idx, word in enumerate(self.vocabulary)}\n self.index_word = {idx: word for idx, word in enumerate(self.vocabulary)}\n\n def generate_training_data(self, documents):\n # Generate training data for the Skip-gram model\n training_data = []\n for doc in documents:\n words = doc.split()\n for idx, word in enumerate(words):\n target_word = self.word_index[word]\n context = [self.word_index[words[i]] for i in range(max(0, idx - self.window_size), min(len(words), idx + self.window_size + 1)) if i != idx]\n for context_word in context:\n training_data.append((target_word, context_word))\n return training_data\n\n def train(self, documents, epochs=1000):\n # Tokenize the documents and generate training data\n self.tokenize(documents)\n training_data = self.generate_training_data(documents)\n \n # Initialize weight matrices with random values\n vocab_size = len(self.vocabulary)\n self.W1 = np.random.uniform(-1, 1, (vocab_size, self.embedding_dim))\n self.W2 = np.random.uniform(-1, 1, (self.embedding_dim, vocab_size))\n \n for epoch in range(epochs):\n loss = 0\n for target_word, context_word in training_data:\n # Forward pass\n h = self.W1[target_word] # Hidden layer representation of the target word\n u = np.dot(h, self.W2) # Output layer scores\n y_pred = self.softmax(u) # Predicted probabilities\n \n # Calculate error\n e = np.zeros(vocab_size)\n e[context_word] = 1\n error = y_pred - e\n \n # Backpropagation\n self.W1[target_word] -= self.learning_rate * np.dot(self.W2, error)\n self.W2 -= self.learning_rate * np.outer(h, error)\n \n # Calculate loss (cross-entropy)\n loss -= np.log(y_pred[context_word])\n \n if (epoch + 1) % 100 == 0:\n print(f'Epoch {epoch + 1}, Loss: {loss}')\n\n def softmax(self, x):\n # Softmax function to convert scores into probabilities\n e_x = np.exp(x - np.max(x))\n return e_x / e_x.sum(axis=0)\n\n def get_word_vector(self, word):\n # Retrieve the vector representation of a word\n return self.W1[self.word_index[word]]\n\n def get_vocabulary(self):\n # Retrieve the vocabulary\n return self.vocabulary","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:18:01.791652Z","iopub.execute_input":"2024-07-20T10:18:01.792119Z","iopub.status.idle":"2024-07-20T10:18:01.808809Z","shell.execute_reply.started":"2024-07-20T10:18:01.792084Z","shell.execute_reply":"2024-07-20T10:18:01.807756Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"# Example usage\nif __name__ == \"__main__\":\n # Basic example usage\n documents = [\n \"the cat sat on the mat\",\n \"the dog ate my homework\",\n \"the cat ate the dog food\"\n ]\n\n word2vec = Word2Vec()\n word2vec.train(documents)\n\n # Getting the word vector for 'cat'\n word_vector 
= word2vec.get_word_vector('cat')\n print(\"Vector for 'cat':\", word_vector)","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:14:57.090284Z","iopub.execute_input":"2024-07-20T10:14:57.091266Z","iopub.status.idle":"2024-07-20T10:14:58.587081Z","shell.execute_reply.started":"2024-07-20T10:14:57.091227Z","shell.execute_reply":"2024-07-20T10:14:58.586006Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"Epoch 100, Loss: 75.35100287508789\nEpoch 200, Loss: 72.30321253446704\nEpoch 300, Loss: 71.89275194982119\nEpoch 400, Loss: 71.8635105655411\nEpoch 500, Loss: 71.92973913785089\nEpoch 600, Loss: 72.0327548745859\nEpoch 700, Loss: 72.15655555085135\nEpoch 800, Loss: 72.29658835738203\nEpoch 900, Loss: 72.45216981302312\nEpoch 1000, Loss: 72.62419809127482\nVector for 'cat': [-0.07713173 -1.51443462 -0.67274315 0.31749278 0.37492043 -0.29362281\n 0.83514656 -0.73169726 -0.60048742 1.68112817]\n","output_type":"stream"}]},{"cell_type":"code","source":"# Additional example usage\nif __name__ == \"__main__\":\n # Sample documents\n documents = [\n \"I love programming in Python\",\n \"Machine learning is fun\",\n \"Python is a versatile language\",\n \"Learning new skills is always beneficial\"\n ]\n\n # Initialize and train the Word2Vec model\n word2vec = Word2Vec()\n word2vec.train(documents)\n\n # Print the vocabulary\n print(\"Vocabulary:\", word2vec.get_vocabulary())\n\n # Print the word vectors for each word in the vocabulary\n print(\"Word Vectors:\")\n for word in word2vec.get_vocabulary():\n vector = word2vec.get_word_vector(word)\n print(f\"Vector for '{word}':\", vector)\n\n # More example documents with mixed content\n more_documents = [\n \"the quick brown fox jumps over the lazy dog\",\n \"a journey of a thousand miles begins with a single step\",\n \"to be or not to be that is the question\",\n \"the rain in Spain stays mainly in the plain\",\n \"all human beings are born free and equal in dignity and rights\"\n ]\n\n # Initialize and train the Word2Vec model on new documents\n word2vec_more = Word2Vec()\n word2vec_more.train(more_documents)\n\n # Print the word vectors for selected words\n print(\"\\nWord Vectors for new documents:\")\n for word in ['quick', 'journey', 'be', 'rain', 'human']:\n vector = word2vec_more.get_word_vector(word)\n print(f\"Vector for '{word}':\", vector)","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:18:04.093039Z","iopub.execute_input":"2024-07-20T10:18:04.093849Z","iopub.status.idle":"2024-07-20T10:18:11.208990Z","shell.execute_reply.started":"2024-07-20T10:18:04.093806Z","shell.execute_reply":"2024-07-20T10:18:11.207981Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"Epoch 100, Loss: 87.29077775759023\nEpoch 200, Loss: 79.18780389421288\nEpoch 300, Loss: 77.94910795980299\nEpoch 400, Loss: 77.66245836714485\nEpoch 500, Loss: 77.60853401053721\nEpoch 600, Loss: 77.63645020161269\nEpoch 700, Loss: 77.70072895043553\nEpoch 800, Loss: 77.78464829390958\nEpoch 900, Loss: 77.88140561618562\nEpoch 1000, Loss: 77.98820209880577\nVocabulary: ['love', 'learning', 'Learning', 'a', 'beneficial', 'new', 'Machine', 'fun', 'always', 'I', 'language', 'Python', 'programming', 'skills', 'versatile', 'is', 'in']\nWord Vectors:\nVector for 'love': [ 1.25611249 0.72399226 0.73932742 -1.1793396 -0.03000625 0.76406502\n -1.51633466 -0.21015759 -0.5542326 -0.18235466]\nVector for 'learning': [-0.05385896 -1.602021 1.4943953 0.44231527 -0.07299037 -0.74754454\n -1.3903911 1.00685072 0.4544704 
-1.36141874]\nVector for 'Learning': [-1.74465271 -0.68294311 -1.5425367 -0.05822687 -0.26804989 -0.1100379\n 0.64812036 0.93995388 0.00906527 0.51813513]\nVector for 'a': [ 0.42309774 0.36851908 -1.21744857 0.03453108 0.56061874 0.66453021\n 1.48116814 -1.41757756 1.6581559 1.24310819]\nVector for 'beneficial': [-0.64162578 -0.05537255 1.84063196 1.11458998 -0.18536477 -0.51397477\n 1.1567393 -0.08573849 1.02205477 -0.10391583]\nVector for 'new': [-0.84236249 -1.1861667 0.47188798 -0.90436565 -1.54968604 0.26117848\n 0.31966193 0.60843338 1.44536674 0.45516902]\nVector for 'Machine': [ 0.56576103 -0.22504546 0.37463336 2.0133637 -0.98459966 -0.09227832\n 0.09937559 0.87758714 0.33328967 0.62105041]\nVector for 'fun': [ 0.74045829 -0.10269314 0.38566757 1.49587936 -0.46728908 0.6369226\n 0.35471937 1.21234412 0.62831945 1.05840841]\nVector for 'always': [-0.50630359 -1.0742406 -1.41055437 0.88033565 -1.75850862 0.06157641\n -0.07544321 -0.34542803 1.01462117 -0.42767684]\nVector for 'I': [ 0.08341753 0.90589692 -0.11179104 0.21613275 -1.58854584 -1.02474237\n -1.15777817 -1.53751946 -0.30385788 1.19445792]\nVector for 'language': [-0.14495823 0.89041417 -0.2826522 0.55803733 1.67530985 -1.50716127\n 0.44839956 -0.75002047 0.19454055 0.41988745]\nVector for 'Python': [ 1.3746919 -0.69313117 1.20280712 0.33296481 1.0498198 -0.30617252\n -0.68207893 -1.63528049 0.11746197 0.18804705]\nVector for 'programming': [ 0.88953518 0.3960943 0.08324377 -0.81845185 0.45232069 0.50849579\n 0.31470319 -0.28788132 -2.10671769 0.9837999 ]\nVector for 'skills': [-0.61114903 -1.23867723 0.67501036 -0.20070013 0.45251135 1.06565594\n 1.98702809 0.97876004 0.29505154 0.0113592 ]\nVector for 'versatile': [ 0.6889747 -0.8286536 -0.93369097 0.45645706 1.77826296 -0.11065061\n -0.17642369 -0.38036503 1.33092187 -0.6485383 ]\nVector for 'is': [-1.42321806 1.03466665 -0.45712698 1.15218139 0.04484756 -0.21603108\n 0.33568069 0.13245389 -0.43471618 -0.76258933]\nVector for 'in': [ 1.22477025 0.09302424 0.48840564 0.34210232 -1.54938386 0.06713506\n 0.48047235 -1.5520969 -1.28586721 0.29676856]\nEpoch 100, Loss: 295.9373094225889\nEpoch 200, Loss: 274.5912114033107\nEpoch 300, Loss: 271.589257628458\nEpoch 400, Loss: 270.94384683000214\nEpoch 500, Loss: 270.887818905955\nEpoch 600, Loss: 271.04736753192935\nEpoch 700, Loss: 271.3131621354528\nEpoch 800, Loss: 271.648120274235\nEpoch 900, Loss: 272.0401492641082\nEpoch 1000, Loss: 272.4875456710171\n\nWord Vectors for new documents:\nVector for 'quick': [ 0.84963171 -0.40584142 0.17137242 -1.8239235 1.30300479 1.18797836\n 0.83016292 0.34557914 -0.67271966 -1.01562793]\nVector for 'journey': [-0.33640099 0.42400908 -1.28853498 1.11235157 -0.68733637 -1.4828937\n 1.16960258 1.64052887 -0.04964284 0.26274727]\nVector for 'be': [-1.29584666 1.08229277 1.21521283 -0.76696754 0.70692083 0.58232736\n -0.28705896 -1.90282741 0.43638555 0.51532593]\nVector for 'rain': [-1.75428039 -1.2910116 -1.03591684 -0.72590607 -0.19088637 1.23074524\n -0.7232419 0.01864839 1.13582975 -0.6059139 ]\nVector for 'human': [ 0.89800051 0.99213515 -1.3657362 -0.33661478 0.05095791 1.71125124\n -0.48803223 -0.31073697 0.51263003 1.35208204]\n","output_type":"stream"}]},{"cell_type":"markdown","source":"#### Explanation of the Code\n\n1. **Initialization**:\n - `window_size`: Defines the size of the context window around the target word.\n - `embedding_dim`: Dimension of the word vectors (embedding space).\n - `learning_rate`: Rate at which weights are updated.\n\n2. 
**Tokenization**:\n - The `tokenize` method creates a vocabulary from the documents and builds mappings between words and their indices.\n\n3. **Generate Training Data**:\n - The `generate_training_data` method creates pairs of target words and context words based on the window size.\n\n4. **Training**:\n - The `train` method initializes the weight matrices and updates them using gradient descent.\n - For each word-context pair, it computes the hidden layer representation, predicts context probabilities, calculates the error, and updates the weights.\n\n5. **Softmax Function**:\n - The `softmax` function converts the output layer scores into probabilities, which are used to compute the error and update the weights.\n\n6. **Retrieve Word Vector**:\n - The `get_word_vector` method retrieves the embedding of a specific word.","metadata":{}}]} \ No newline at end of file From 88aad0aaee7b65a6310009c7852f4de599ba6d2b Mon Sep 17 00:00:00 2001 From: UTSAV SINGHAL Date: Sun, 21 Jul 2024 14:15:24 +0530 Subject: [PATCH 2/6] TF-IDF and Word2Vec --- NLP/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NLP/README.md b/NLP/README.md index 16337cc..80ff8ef 100644 --- a/NLP/README.md +++ b/NLP/README.md @@ -6,7 +6,7 @@ | S.No | Algorithm | S.No. | Algorithm | S.No. | Algorithm | |-------|-----------|-------|-----------|-------|-----------| -| 1 | [Bag of Words](./Algorithms/BagOfWords) | 2 | | 3 | | +| 1 | [Bag of Words](./Algorithms/BagOfWords) | 2 | [TF-IDF](./Algorithms/TF-IDF/tf-idf.ipynb) | 3 | [Word2Vec](./Algorithms/Word2Vec/word2vec.ipynb) | | 4 | | 5 | | 6 | | ## Available Documentations From d128d83aa1c71da2c35c1fb04d597ce17007e425 Mon Sep 17 00:00:00 2001 From: UTSAV SINGHAL Date: Wed, 24 Jul 2024 11:41:03 +0530 Subject: [PATCH 3/6] Delete NLP/Algorithms/Word2Vec/word2vec.ipynb --- NLP/Algorithms/Word2Vec/word2vec.ipynb | 1 - 1 file changed, 1 deletion(-) delete mode 100644 NLP/Algorithms/Word2Vec/word2vec.ipynb diff --git a/NLP/Algorithms/Word2Vec/word2vec.ipynb b/NLP/Algorithms/Word2Vec/word2vec.ipynb deleted file mode 100644 index cb909d2..0000000 --- a/NLP/Algorithms/Word2Vec/word2vec.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30746,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### What is Word2Vec?\n\n**Word2Vec** is a technique to learn word embeddings using neural networks. The primary goal is to represent words in a continuous vector space where semantically similar words are mapped to nearby points. Word2Vec can be implemented using two main architectures:\n\n1. **Continuous Bag of Words (CBOW)**: Predicts the target word based on the context words (surrounding words).\n2. **Skip-gram**: Predicts the context words based on a given target word.\n\nIn this example, we'll focus on the Skip-gram approach, which is more commonly used in practice. 
The Skip-gram model tries to maximize the probability of context words given a target word.","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"code","source":"import numpy as np\n\nclass Word2Vec:\n def __init__(self, window_size=2, embedding_dim=10, learning_rate=0.01):\n # Initialize parameters\n self.window_size = window_size\n self.embedding_dim = embedding_dim\n self.learning_rate = learning_rate\n self.vocabulary = {}\n self.word_index = {}\n self.index_word = {}\n self.W1 = None\n self.W2 = None\n\n def tokenize(self, documents):\n # Tokenize documents and build vocabulary\n vocabulary = set()\n for doc in documents:\n words = doc.split()\n vocabulary.update(words)\n \n self.vocabulary = list(vocabulary)\n self.word_index = {word: idx for idx, word in enumerate(self.vocabulary)}\n self.index_word = {idx: word for idx, word in enumerate(self.vocabulary)}\n\n def generate_training_data(self, documents):\n # Generate training data for the Skip-gram model\n training_data = []\n for doc in documents:\n words = doc.split()\n for idx, word in enumerate(words):\n target_word = self.word_index[word]\n context = [self.word_index[words[i]] for i in range(max(0, idx - self.window_size), min(len(words), idx + self.window_size + 1)) if i != idx]\n for context_word in context:\n training_data.append((target_word, context_word))\n return training_data\n\n def train(self, documents, epochs=1000):\n # Tokenize the documents and generate training data\n self.tokenize(documents)\n training_data = self.generate_training_data(documents)\n \n # Initialize weight matrices with random values\n vocab_size = len(self.vocabulary)\n self.W1 = np.random.uniform(-1, 1, (vocab_size, self.embedding_dim))\n self.W2 = np.random.uniform(-1, 1, (self.embedding_dim, vocab_size))\n \n for epoch in range(epochs):\n loss = 0\n for target_word, context_word in training_data:\n # Forward pass\n h = self.W1[target_word] # Hidden layer representation of the target word\n u = np.dot(h, self.W2) # Output layer scores\n y_pred = self.softmax(u) # Predicted probabilities\n \n # Calculate error\n e = np.zeros(vocab_size)\n e[context_word] = 1\n error = y_pred - e\n \n # Backpropagation\n self.W1[target_word] -= self.learning_rate * np.dot(self.W2, error)\n self.W2 -= self.learning_rate * np.outer(h, error)\n \n # Calculate loss (cross-entropy)\n loss -= np.log(y_pred[context_word])\n \n if (epoch + 1) % 100 == 0:\n print(f'Epoch {epoch + 1}, Loss: {loss}')\n\n def softmax(self, x):\n # Softmax function to convert scores into probabilities\n e_x = np.exp(x - np.max(x))\n return e_x / e_x.sum(axis=0)\n\n def get_word_vector(self, word):\n # Retrieve the vector representation of a word\n return self.W1[self.word_index[word]]\n\n def get_vocabulary(self):\n # Retrieve the vocabulary\n return self.vocabulary","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:18:01.791652Z","iopub.execute_input":"2024-07-20T10:18:01.792119Z","iopub.status.idle":"2024-07-20T10:18:01.808809Z","shell.execute_reply.started":"2024-07-20T10:18:01.792084Z","shell.execute_reply":"2024-07-20T10:18:01.807756Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"# Example usage\nif __name__ == \"__main__\":\n # Basic example usage\n documents = [\n \"the cat sat on the mat\",\n \"the dog ate my homework\",\n \"the cat ate the dog food\"\n ]\n\n word2vec = Word2Vec()\n word2vec.train(documents)\n\n # Getting the word vector for 'cat'\n word_vector 
= word2vec.get_word_vector('cat')\n print(\"Vector for 'cat':\", word_vector)","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:14:57.090284Z","iopub.execute_input":"2024-07-20T10:14:57.091266Z","iopub.status.idle":"2024-07-20T10:14:58.587081Z","shell.execute_reply.started":"2024-07-20T10:14:57.091227Z","shell.execute_reply":"2024-07-20T10:14:58.586006Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"Epoch 100, Loss: 75.35100287508789\nEpoch 200, Loss: 72.30321253446704\nEpoch 300, Loss: 71.89275194982119\nEpoch 400, Loss: 71.8635105655411\nEpoch 500, Loss: 71.92973913785089\nEpoch 600, Loss: 72.0327548745859\nEpoch 700, Loss: 72.15655555085135\nEpoch 800, Loss: 72.29658835738203\nEpoch 900, Loss: 72.45216981302312\nEpoch 1000, Loss: 72.62419809127482\nVector for 'cat': [-0.07713173 -1.51443462 -0.67274315 0.31749278 0.37492043 -0.29362281\n 0.83514656 -0.73169726 -0.60048742 1.68112817]\n","output_type":"stream"}]},{"cell_type":"code","source":"# Additional example usage\nif __name__ == \"__main__\":\n # Sample documents\n documents = [\n \"I love programming in Python\",\n \"Machine learning is fun\",\n \"Python is a versatile language\",\n \"Learning new skills is always beneficial\"\n ]\n\n # Initialize and train the Word2Vec model\n word2vec = Word2Vec()\n word2vec.train(documents)\n\n # Print the vocabulary\n print(\"Vocabulary:\", word2vec.get_vocabulary())\n\n # Print the word vectors for each word in the vocabulary\n print(\"Word Vectors:\")\n for word in word2vec.get_vocabulary():\n vector = word2vec.get_word_vector(word)\n print(f\"Vector for '{word}':\", vector)\n\n # More example documents with mixed content\n more_documents = [\n \"the quick brown fox jumps over the lazy dog\",\n \"a journey of a thousand miles begins with a single step\",\n \"to be or not to be that is the question\",\n \"the rain in Spain stays mainly in the plain\",\n \"all human beings are born free and equal in dignity and rights\"\n ]\n\n # Initialize and train the Word2Vec model on new documents\n word2vec_more = Word2Vec()\n word2vec_more.train(more_documents)\n\n # Print the word vectors for selected words\n print(\"\\nWord Vectors for new documents:\")\n for word in ['quick', 'journey', 'be', 'rain', 'human']:\n vector = word2vec_more.get_word_vector(word)\n print(f\"Vector for '{word}':\", vector)","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:18:04.093039Z","iopub.execute_input":"2024-07-20T10:18:04.093849Z","iopub.status.idle":"2024-07-20T10:18:11.208990Z","shell.execute_reply.started":"2024-07-20T10:18:04.093806Z","shell.execute_reply":"2024-07-20T10:18:11.207981Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"Epoch 100, Loss: 87.29077775759023\nEpoch 200, Loss: 79.18780389421288\nEpoch 300, Loss: 77.94910795980299\nEpoch 400, Loss: 77.66245836714485\nEpoch 500, Loss: 77.60853401053721\nEpoch 600, Loss: 77.63645020161269\nEpoch 700, Loss: 77.70072895043553\nEpoch 800, Loss: 77.78464829390958\nEpoch 900, Loss: 77.88140561618562\nEpoch 1000, Loss: 77.98820209880577\nVocabulary: ['love', 'learning', 'Learning', 'a', 'beneficial', 'new', 'Machine', 'fun', 'always', 'I', 'language', 'Python', 'programming', 'skills', 'versatile', 'is', 'in']\nWord Vectors:\nVector for 'love': [ 1.25611249 0.72399226 0.73932742 -1.1793396 -0.03000625 0.76406502\n -1.51633466 -0.21015759 -0.5542326 -0.18235466]\nVector for 'learning': [-0.05385896 -1.602021 1.4943953 0.44231527 -0.07299037 -0.74754454\n -1.3903911 1.00685072 0.4544704 
-1.36141874]\nVector for 'Learning': [-1.74465271 -0.68294311 -1.5425367 -0.05822687 -0.26804989 -0.1100379\n 0.64812036 0.93995388 0.00906527 0.51813513]\nVector for 'a': [ 0.42309774 0.36851908 -1.21744857 0.03453108 0.56061874 0.66453021\n 1.48116814 -1.41757756 1.6581559 1.24310819]\nVector for 'beneficial': [-0.64162578 -0.05537255 1.84063196 1.11458998 -0.18536477 -0.51397477\n 1.1567393 -0.08573849 1.02205477 -0.10391583]\nVector for 'new': [-0.84236249 -1.1861667 0.47188798 -0.90436565 -1.54968604 0.26117848\n 0.31966193 0.60843338 1.44536674 0.45516902]\nVector for 'Machine': [ 0.56576103 -0.22504546 0.37463336 2.0133637 -0.98459966 -0.09227832\n 0.09937559 0.87758714 0.33328967 0.62105041]\nVector for 'fun': [ 0.74045829 -0.10269314 0.38566757 1.49587936 -0.46728908 0.6369226\n 0.35471937 1.21234412 0.62831945 1.05840841]\nVector for 'always': [-0.50630359 -1.0742406 -1.41055437 0.88033565 -1.75850862 0.06157641\n -0.07544321 -0.34542803 1.01462117 -0.42767684]\nVector for 'I': [ 0.08341753 0.90589692 -0.11179104 0.21613275 -1.58854584 -1.02474237\n -1.15777817 -1.53751946 -0.30385788 1.19445792]\nVector for 'language': [-0.14495823 0.89041417 -0.2826522 0.55803733 1.67530985 -1.50716127\n 0.44839956 -0.75002047 0.19454055 0.41988745]\nVector for 'Python': [ 1.3746919 -0.69313117 1.20280712 0.33296481 1.0498198 -0.30617252\n -0.68207893 -1.63528049 0.11746197 0.18804705]\nVector for 'programming': [ 0.88953518 0.3960943 0.08324377 -0.81845185 0.45232069 0.50849579\n 0.31470319 -0.28788132 -2.10671769 0.9837999 ]\nVector for 'skills': [-0.61114903 -1.23867723 0.67501036 -0.20070013 0.45251135 1.06565594\n 1.98702809 0.97876004 0.29505154 0.0113592 ]\nVector for 'versatile': [ 0.6889747 -0.8286536 -0.93369097 0.45645706 1.77826296 -0.11065061\n -0.17642369 -0.38036503 1.33092187 -0.6485383 ]\nVector for 'is': [-1.42321806 1.03466665 -0.45712698 1.15218139 0.04484756 -0.21603108\n 0.33568069 0.13245389 -0.43471618 -0.76258933]\nVector for 'in': [ 1.22477025 0.09302424 0.48840564 0.34210232 -1.54938386 0.06713506\n 0.48047235 -1.5520969 -1.28586721 0.29676856]\nEpoch 100, Loss: 295.9373094225889\nEpoch 200, Loss: 274.5912114033107\nEpoch 300, Loss: 271.589257628458\nEpoch 400, Loss: 270.94384683000214\nEpoch 500, Loss: 270.887818905955\nEpoch 600, Loss: 271.04736753192935\nEpoch 700, Loss: 271.3131621354528\nEpoch 800, Loss: 271.648120274235\nEpoch 900, Loss: 272.0401492641082\nEpoch 1000, Loss: 272.4875456710171\n\nWord Vectors for new documents:\nVector for 'quick': [ 0.84963171 -0.40584142 0.17137242 -1.8239235 1.30300479 1.18797836\n 0.83016292 0.34557914 -0.67271966 -1.01562793]\nVector for 'journey': [-0.33640099 0.42400908 -1.28853498 1.11235157 -0.68733637 -1.4828937\n 1.16960258 1.64052887 -0.04964284 0.26274727]\nVector for 'be': [-1.29584666 1.08229277 1.21521283 -0.76696754 0.70692083 0.58232736\n -0.28705896 -1.90282741 0.43638555 0.51532593]\nVector for 'rain': [-1.75428039 -1.2910116 -1.03591684 -0.72590607 -0.19088637 1.23074524\n -0.7232419 0.01864839 1.13582975 -0.6059139 ]\nVector for 'human': [ 0.89800051 0.99213515 -1.3657362 -0.33661478 0.05095791 1.71125124\n -0.48803223 -0.31073697 0.51263003 1.35208204]\n","output_type":"stream"}]},{"cell_type":"markdown","source":"#### Explanation of the Code\n\n1. **Initialization**:\n - `window_size`: Defines the size of the context window around the target word.\n - `embedding_dim`: Dimension of the word vectors (embedding space).\n - `learning_rate`: Rate at which weights are updated.\n\n2. 
**Tokenization**:\n - The `tokenize` method creates a vocabulary from the documents and builds mappings between words and their indices.\n\n3. **Generate Training Data**:\n - The `generate_training_data` method creates pairs of target words and context words based on the window size.\n\n4. **Training**:\n - The `train` method initializes the weight matrices and updates them using gradient descent.\n - For each word-context pair, it computes the hidden layer representation, predicts context probabilities, calculates the error, and updates the weights.\n\n5. **Softmax Function**:\n - The `softmax` function converts the output layer scores into probabilities, which are used to compute the error and update the weights.\n\n6. **Retrieve Word Vector**:\n - The `get_word_vector` method retrieves the embedding of a specific word.","metadata":{}}]} \ No newline at end of file From ae617b17cf3b8f530dd063fbb507de7215228e22 Mon Sep 17 00:00:00 2001 From: UTSAV SINGHAL Date: Wed, 24 Jul 2024 11:41:14 +0530 Subject: [PATCH 4/6] Delete NLP/Algorithms/TF-IDF directory --- NLP/Algorithms/TF-IDF/tf-idf.ipynb | 1 - 1 file changed, 1 deletion(-) delete mode 100644 NLP/Algorithms/TF-IDF/tf-idf.ipynb diff --git a/NLP/Algorithms/TF-IDF/tf-idf.ipynb b/NLP/Algorithms/TF-IDF/tf-idf.ipynb deleted file mode 100644 index e0a3dfa..0000000 --- a/NLP/Algorithms/TF-IDF/tf-idf.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30746,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"#### The TF-IDF (Term Frequency-Inverse Document Frequency) algorithm is used to convert a collection of text documents into a matrix of TF-IDF features. 
It is commonly used in text mining and information retrieval to reflect the importance of a word in a document relative to a collection of documents.","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"code","source":"import math\nfrom collections import Counter\n\nclass TFIDF:\n def __init__(self):\n self.vocabulary = {} # Vocabulary to store word indices\n self.idf_values = {} # IDF values for words\n\n def fit(self, documents):\n \"\"\"\n Compute IDF values based on the provided documents.\n \n Args:\n documents (list of str): List of documents where each document is a string.\n \"\"\"\n doc_count = len(documents)\n term_doc_count = Counter() # To count the number of documents containing each word\n\n # Count occurrences of words in documents\n for doc in documents:\n words = set(doc.split()) # Unique words in the current document\n for word in words:\n term_doc_count[word] += 1\n\n # Compute IDF values\n self.idf_values = {\n word: math.log(doc_count / (count + 1)) # +1 to avoid division by zero\n for word, count in term_doc_count.items()\n }\n\n # Build vocabulary\n self.vocabulary = {word: idx for idx, word in enumerate(self.idf_values.keys())}\n\n def transform(self, documents):\n \"\"\"\n Transform documents into TF-IDF representation.\n\n Args:\n documents (list of str): List of documents where each document is a string.\n \n Returns:\n list of list of float: TF-IDF matrix where each row corresponds to a document.\n \"\"\"\n rows = []\n for doc in documents:\n words = doc.split()\n word_count = Counter(words)\n doc_length = len(words)\n row = [0] * len(self.vocabulary)\n\n for word, count in word_count.items():\n if word in self.vocabulary:\n tf = count / doc_length\n idf = self.idf_values[word]\n index = self.vocabulary[word]\n row[index] = tf * idf\n rows.append(row)\n return rows\n\n def fit_transform(self, documents):\n \"\"\"\n Compute IDF values and transform documents into TF-IDF representation.\n\n Args:\n documents (list of str): List of documents where each document is a string.\n\n Returns:\n list of list of float: TF-IDF matrix where each row corresponds to a document.\n \"\"\"\n self.fit(documents)\n return self.transform(documents)","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:08:08.207148Z","iopub.execute_input":"2024-07-20T10:08:08.207645Z","iopub.status.idle":"2024-07-20T10:08:08.222510Z","shell.execute_reply.started":"2024-07-20T10:08:08.207605Z","shell.execute_reply":"2024-07-20T10:08:08.221404Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"# Example usage\nif __name__ == \"__main__\":\n documents = [\n \"the cat sat on the mat\",\n \"the dog ate my homework\",\n \"the cat ate the dog food\"\n ]\n\n tfidf = TFIDF()\n tfidf_matrix = tfidf.fit_transform(documents)\n for i, row in enumerate(tfidf_matrix):\n print(f\"Document {i}: {row}\")","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:08:10.692831Z","iopub.execute_input":"2024-07-20T10:08:10.693205Z","iopub.status.idle":"2024-07-20T10:08:10.699967Z","shell.execute_reply.started":"2024-07-20T10:08:10.693178Z","shell.execute_reply":"2024-07-20T10:08:10.698625Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"Document 0: [0.0, -0.09589402415059363, 0.06757751801802739, 0.06757751801802739, 0.06757751801802739, 0, 0, 0, 0, 0]\nDocument 1: [0, -0.05753641449035618, 0, 0, 0, 0.08109302162163289, 0.08109302162163289, 0.0, 0.0, 0]\nDocument 2: [0.0, 
-0.09589402415059363, 0, 0, 0, 0, 0, 0.0, 0.0, 0.06757751801802739]\n","output_type":"stream"}]},{"cell_type":"code","source":"# Additional example usage\nif __name__ == \"__main__\":\n # Sample documents\n documents = [\n \"I love programming in Python\",\n \"Machine learning is fun\",\n \"Python is a versatile language\",\n \"Learning new skills is always beneficial\"\n ]\n\n # Initialize the TF-IDF model\n tfidf = TFIDF()\n \n # Fit the model and transform the documents\n tfidf_matrix = tfidf.fit_transform(documents)\n \n # Print the vocabulary\n print(\"Vocabulary:\", tfidf.vocabulary)\n \n # Print the TF-IDF representation\n print(\"TF-IDF Representation:\")\n for i, vector in enumerate(tfidf_matrix):\n print(f\"Document {i + 1}: {vector}\")\n\n # More example documents with mixed content\n more_documents = [\n \"the quick brown fox jumps over the lazy dog\",\n \"a journey of a thousand miles begins with a single step\",\n \"to be or not to be that is the question\",\n \"the rain in Spain stays mainly in the plain\",\n \"all human beings are born free and equal in dignity and rights\"\n ]\n\n # Fit the model and transform the new set of documents\n tfidf_more = TFIDF()\n tfidf_matrix_more = tfidf_more.fit_transform(more_documents)\n \n # Print the vocabulary for the new documents\n print(\"\\nVocabulary for new documents:\", tfidf_more.vocabulary)\n \n # Print the TF-IDF representation for the new documents\n print(\"TF-IDF Representation for new documents:\")\n for i, vector in enumerate(tfidf_matrix_more):\n print(f\"Document {i + 1}: {vector}\")","metadata":{"execution":{"iopub.status.busy":"2024-07-20T10:09:51.105985Z","iopub.execute_input":"2024-07-20T10:09:51.107160Z","iopub.status.idle":"2024-07-20T10:09:51.118181Z","shell.execute_reply.started":"2024-07-20T10:09:51.107108Z","shell.execute_reply":"2024-07-20T10:09:51.116972Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"Vocabulary: {'love': 0, 'I': 1, 'Python': 2, 'programming': 3, 'in': 4, 'learning': 5, 'fun': 6, 'Machine': 7, 'is': 8, 'a': 9, 'language': 10, 'versatile': 11, 'Learning': 12, 'beneficial': 13, 'new': 14, 'always': 15, 'skills': 16}\nTF-IDF Representation:\nDocument 1: [0.13862943611198905, 0.13862943611198905, 0.05753641449035617, 0.13862943611198905, 0.13862943611198905, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 2: [0, 0, 0, 0, 0, 0.17328679513998632, 0.17328679513998632, 0.17328679513998632, 0.0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 3: [0, 0, 0.05753641449035617, 0, 0, 0, 0, 0, 0.0, 0.13862943611198905, 0.13862943611198905, 0.13862943611198905, 0, 0, 0, 0, 0]\nDocument 4: [0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0, 0.11552453009332421, 0.11552453009332421, 0.11552453009332421, 0.11552453009332421, 0.11552453009332421]\n\nVocabulary for new documents: {'brown': 0, 'fox': 1, 'quick': 2, 'over': 3, 'the': 4, 'lazy': 5, 'dog': 6, 'jumps': 7, 'thousand': 8, 'journey': 9, 'single': 10, 'a': 11, 'step': 12, 'with': 13, 'of': 14, 'miles': 15, 'begins': 16, 'to': 17, 'or': 18, 'question': 19, 'not': 20, 'be': 21, 'that': 22, 'is': 23, 'Spain': 24, 'rain': 25, 'mainly': 26, 'plain': 27, 'stays': 28, 'in': 29, 'human': 30, 'and': 31, 'all': 32, 'born': 33, 'equal': 34, 'dignity': 35, 'are': 36, 'rights': 37, 'beings': 38, 'free': 39}\nTF-IDF Representation for new documents:\nDocument 1: [0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.049587455847602165, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 2: [0, 0, 0, 0, 0, 0, 0, 0, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0.249897472329315, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0.08329915744310501, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 3: [0, 0, 0, 0, 0.02231435513142098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.18325814637483104, 0.09162907318741552, 0.09162907318741552, 0.09162907318741552, 0.18325814637483104, 0.09162907318741552, 0.09162907318741552, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 4: [0, 0, 0, 0, 0.049587455847602165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.10181008131935056, 0.11351680528133126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nDocument 5: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.04256880198049923, 0.07635756098951292, 0.15271512197902584, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292, 0.07635756098951292]\n","output_type":"stream"}]},{"cell_type":"markdown","source":"#### Explanation:\n\n1. **Initialization**:\n - `self.vocabulary`: Dictionary to store the mapping of words to their indices in the TF-IDF matrix.\n - `self.idf_values`: Dictionary to store the IDF (Inverse Document Frequency) values for each word.\n\n2. **`fit` Method**:\n - **Input**: List of documents.\n - **Purpose**: Calculate the IDF values for all unique words in the corpus.\n - **Steps**:\n 1. Count the number of documents containing each word.\n 2. Compute the IDF for each word using the formula:\n $$\n \\text{IDF}(word) = \\log \\left(\\frac{\\text{Total number of documents}}{\\text{Number of documents containing the word} + 1}\\right)\n $$\n Adding 1 avoids division by zero.\n 3. Build the vocabulary with word-to-index mapping.\n\n3. **`transform` Method**:\n - **Input**: List of documents.\n - **Purpose**: Convert each document into a TF-IDF representation.\n - **Steps**:\n 1. Compute Term Frequency (TF) for each word in the document:\n $$\n \\text{TF} = \\frac{\\text{Count of the word}}{\\text{Total number of words in the document}}\n $$\n 2. Compute the TF-IDF value:\n $$\n \\text{TF-IDF} = \\text{TF} \\times \\text{IDF}\n $$\n 3. Store the TF-IDF values in a matrix where each row corresponds to a document.\n\n4. 
**`fit_transform` Method**:\n - **Purpose**: Perform both fitting (computing IDF values) and transforming (converting documents to TF-IDF representation) in one step.","metadata":{}}]} \ No newline at end of file From 97045fdbe8fa2771803c540e46d1c3aefcab0305 Mon Sep 17 00:00:00 2001 From: UTSAV SINGHAL Date: Wed, 24 Jul 2024 11:47:08 +0530 Subject: [PATCH 5/6] Add files via upload --- NLP/Algorithms/TF-IDF/README.md | 82 ++++++++++++++++++ NLP/Algorithms/TF-IDF/tf_idf.py | 117 ++++++++++++++++++++++++++ NLP/Algorithms/Word2Vec/README.md | 105 +++++++++++++++++++++++ NLP/Algorithms/Word2Vec/word2vec.py | 126 ++++++++++++++++++++++++++++ 4 files changed, 430 insertions(+) create mode 100644 NLP/Algorithms/TF-IDF/README.md create mode 100644 NLP/Algorithms/TF-IDF/tf_idf.py create mode 100644 NLP/Algorithms/Word2Vec/README.md create mode 100644 NLP/Algorithms/Word2Vec/word2vec.py diff --git a/NLP/Algorithms/TF-IDF/README.md b/NLP/Algorithms/TF-IDF/README.md new file mode 100644 index 0000000..f85e575 --- /dev/null +++ b/NLP/Algorithms/TF-IDF/README.md @@ -0,0 +1,82 @@ +# TF-IDF Implementation + +## Introduction + +The `TFIDF` class converts a collection of documents into their respective TF-IDF (Term Frequency-Inverse Document Frequency) representations. TF-IDF is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus). + +## Table of Contents + +1. [Attributes](#attributes) +2. [Methods](#methods) + - [fit Method](#fit-method) + - [transform Method](#transform-method) + - [fit_transform Method](#fit_transform-method) +3. [Explanation of the Code](#explanation-of-the-code) +4. [References](#references) + +## Attributes + +The `TFIDF` class is initialized with two main attributes: + +- **`self.vocabulary`**: A dictionary that maps words to their indices in the TF-IDF matrix. +- **`self.idf_values`**: A dictionary that stores the IDF (Inverse Document Frequency) values for each word. + +## Methods + +### fit Method + +#### Input + +- **`documents`** (list of str): List of documents where each document is a string. + +#### Purpose + +Calculate the IDF values for all unique words in the corpus. + +#### Steps + +1. **Count Document Occurrences**: Determine how many documents contain each word. +2. **Compute IDF**: Calculate the importance of each word across all documents. Higher values indicate the word is more unique to fewer documents. +3. **Build Vocabulary**: Create a mapping of words to unique indexes. + +### transform Method + +#### Input + +- **`documents`** (list of str): A list where each entry is a document in the form of a string. + +#### Purpose + +Convert each document into a numerical representation that shows the importance of each word. + +#### Steps + +1. **Compute Term Frequency (TF)**: Determine how often each word appears in a document relative to the total number of words in that document. +2. **Compute TF-IDF**: Multiply the term frequency of each word by its IDF to get a measure of its relevance in each document. +3. **Store Values**: Save these numerical values in a matrix where each row represents a document. + +### fit_transform Method + +#### Purpose + +Perform both fitting (computing IDF values) and transforming (converting documents to TF-IDF representation) in one step. + +## Explanation of the Code + +The `TFIDF` class includes methods for fitting the model to the data, transforming new data into the TF-IDF representation, and combining these steps. Here's a breakdown of the primary methods: + +1. 
**`fit` Method**: Calculates IDF values for all unique words in the corpus. It counts the number of documents containing each word and computes the IDF. The vocabulary is built with a word-to-index mapping. + +2. **`transform` Method**: Converts each document into a TF-IDF representation. It computes Term Frequency (TF) for each word in the document, calculates TF-IDF by multiplying TF with IDF, and stores these values in a matrix where each row corresponds to a document. + +3. **`fit_transform` Method**: Combines the fitting and transforming steps into a single method for efficient processing of documents. + +## References + +1. [TF-IDF - Wikipedia](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) +2. [Understanding TF-IDF](https://towardsdatascience.com/understanding-tf-idf-a-traditional-approach-to-feature-extraction-in-nlp-a5bfbe04723f) +3. [Scikit-learn: TF-IDF](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) + +--- + +This document provides a clear and structured explanation of the TF-IDF algorithm, including its attributes, methods, and overall functionality. \ No newline at end of file diff --git a/NLP/Algorithms/TF-IDF/tf_idf.py b/NLP/Algorithms/TF-IDF/tf_idf.py new file mode 100644 index 0000000..8541d3b --- /dev/null +++ b/NLP/Algorithms/TF-IDF/tf_idf.py @@ -0,0 +1,117 @@ +import math +from collections import Counter + +class TFIDF: + def __init__(self): + self.vocabulary = {} # Vocabulary to store word indices + self.idf_values = {} # IDF values for words + + def fit(self, documents): + """ + Compute IDF values based on the provided documents. + + Args: + documents (list of str): List of documents where each document is a string. + """ + doc_count = len(documents) + term_doc_count = Counter() # To count the number of documents containing each word + + # Count occurrences of words in documents + for doc in documents: + words = set(doc.split()) # Unique words in the current document + for word in words: + term_doc_count[word] += 1 + + # Compute IDF values + self.idf_values = { + word: math.log(doc_count / (count + 1)) # +1 to avoid division by zero + for word, count in term_doc_count.items() + } + + # Build vocabulary + self.vocabulary = {word: idx for idx, word in enumerate(self.idf_values.keys())} + + def transform(self, documents): + """ + Transform documents into TF-IDF representation. + + Args: + documents (list of str): List of documents where each document is a string. + + Returns: + list of list of float: TF-IDF matrix where each row corresponds to a document. + """ + rows = [] + for doc in documents: + words = doc.split() + word_count = Counter(words) + doc_length = len(words) + row = [0] * len(self.vocabulary) + + for word, count in word_count.items(): + if word in self.vocabulary: + tf = count / doc_length + idf = self.idf_values[word] + index = self.vocabulary[word] + row[index] = tf * idf + rows.append(row) + return rows + + def fit_transform(self, documents): + """ + Compute IDF values and transform documents into TF-IDF representation. + + Args: + documents (list of str): List of documents where each document is a string. + + Returns: + list of list of float: TF-IDF matrix where each row corresponds to a document. 
+ """ + self.fit(documents) + return self.transform(documents) +# Example usage +if __name__ == "__main__": + documents = [ + "the cat sat on the mat", + "the dog ate my homework", + "the cat ate the dog food", + "I love programming in Python", + "Machine learning is fun", + "Python is a versatile language", + "Learning new skills is always beneficial" + ] + + # Initialize the TF-IDF model + tfidf = TFIDF() + + # Fit the model and transform the documents + tfidf_matrix = tfidf.fit_transform(documents) + + # Print the vocabulary + print("Vocabulary:", tfidf.vocabulary) + + # Print the TF-IDF representation + print("TF-IDF Representation:") + for i, vector in enumerate(tfidf_matrix): + print(f"Document {i + 1}: {vector}") + + # More example documents with mixed content + more_documents = [ + "the quick brown fox jumps over the lazy dog", + "a journey of a thousand miles begins with a single step", + "to be or not to be that is the question", + "the rain in Spain stays mainly in the plain", + "all human beings are born free and equal in dignity and rights" + ] + + # Fit the model and transform the new set of documents + tfidf_more = TFIDF() + tfidf_matrix_more = tfidf_more.fit_transform(more_documents) + + # Print the vocabulary for the new documents + print("\nVocabulary for new documents:", tfidf_more.vocabulary) + + # Print the TF-IDF representation for the new documents + print("TF-IDF Representation for new documents:") + for i, vector in enumerate(tfidf_matrix_more): + print(f"Document {i + 1}: {vector}") \ No newline at end of file diff --git a/NLP/Algorithms/Word2Vec/README.md b/NLP/Algorithms/Word2Vec/README.md new file mode 100644 index 0000000..9199176 --- /dev/null +++ b/NLP/Algorithms/Word2Vec/README.md @@ -0,0 +1,105 @@ +# Word2Vec Skip-gram Implementation + +## Introduction + +Word2Vec is a technique to learn word embeddings using neural networks. The primary goal is to represent words in a continuous vector space where semantically similar words are mapped to nearby points. Word2Vec can be implemented using two main architectures: + +1. **Continuous Bag of Words (CBOW)**: Predicts the target word based on the context words (surrounding words). +2. **Skip-gram**: Predicts the context words based on a given target word. + +In this example, we focus on the Skip-gram approach, which is more commonly used in practice. The Skip-gram model tries to maximize the probability of context words given a target word. + +## Table of Contents + +1. [Installation](#installation) +2. [Usage](#usage) + - [Initialization](#initialization) + - [Tokenization](#tokenization) + - [Generate Training Data](#generate-training-data) + - [Training](#training) + - [Retrieve Word Vector](#retrieve-word-vector) +3. [Explanation of the Code](#explanation-of-the-code) +4. [References](#references) + +## Installation + +Ensure you have Python installed. You can install the necessary dependencies using pip: + +```sh +pip install numpy +``` + +## Usage + +### Initialization + +Define the parameters for the Word2Vec model: + +- `window_size`: Defines the size of the context window around the target word. +- `embedding_dim`: Dimension of the word vectors (embedding space). +- `learning_rate`: Rate at which weights are updated. + +### Tokenization + +The `tokenize` method creates a vocabulary from the documents and builds mappings between words and their indices. + +### Generate Training Data + +The `generate_training_data` method creates pairs of target words and context words based on the window size. 
+ +### Training + +The `train` method initializes the weight matrices and updates them using gradient descent. + +For each word-context pair, it computes the hidden layer representation, predicts context probabilities, calculates the error, and updates the weights. + +### Retrieve Word Vector + +The `get_word_vector` method retrieves the embedding of a specific word. + +## Explanation of the Code + +### Initialization + +- **Parameters**: + - `window_size`: Size of the context window around the target word. + - `embedding_dim`: Dimension of the word vectors (embedding space). + - `learning_rate`: Rate at which weights are updated. + +### Tokenization + +- The `tokenize` method creates a vocabulary from the documents. +- Builds mappings between words and their indices. + +### Generate Training Data + +- The `generate_training_data` method creates pairs of target words and context words based on the window size. + +### Training + +- The `train` method initializes the weight matrices. +- Updates the weights using gradient descent. +- For each word-context pair: + - Computes the hidden layer representation. + - Predicts context probabilities. + - Calculates the error. + - Updates the weights. + +### Softmax Function + +- The `softmax` function converts the output layer scores into probabilities. +- Used to compute the error and update the weights. + +### Retrieve Word Vector + +- The `get_word_vector` method retrieves the embedding of a specific word. + +## References + +1. [Word2Vec - Google](https://code.google.com/archive/p/word2vec/) +2. [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/abs/1301.3781) +3. [Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/abs/1310.4546) + +--- + +This README file provides a comprehensive overview of the Word2Vec Skip-gram implementation, including installation instructions, usage details, and an explanation of the code. 
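
Since the implementation that follows leans on the softmax step, a quick standalone numeric check may help (the score values here are made up for illustration): softmax turns arbitrary real-valued output scores into a probability distribution over the vocabulary.

```python
import numpy as np

# Standalone illustration of the softmax step described above.
scores = np.array([2.0, 1.0, 0.1])      # raw output-layer scores u (illustrative values)
probs = np.exp(scores - scores.max())   # subtract the max for numerical stability
probs /= probs.sum()
print(probs)        # approximately [0.659, 0.242, 0.099]
print(probs.sum())  # 1.0
```

The `softmax` method in the class below applies the same max-subtraction trick before exponentiating.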
\ No newline at end of file diff --git a/NLP/Algorithms/Word2Vec/word2vec.py b/NLP/Algorithms/Word2Vec/word2vec.py new file mode 100644 index 0000000..f8988f6 --- /dev/null +++ b/NLP/Algorithms/Word2Vec/word2vec.py @@ -0,0 +1,126 @@ +import numpy as np + +class Word2Vec: + def __init__(self, window_size=2, embedding_dim=10, learning_rate=0.01): + # Initialize parameters + self.window_size = window_size + self.embedding_dim = embedding_dim + self.learning_rate = learning_rate + self.vocabulary = {} + self.word_index = {} + self.index_word = {} + self.W1 = None + self.W2 = None + + def tokenize(self, documents): + # Tokenize documents and build vocabulary + vocabulary = set() + for doc in documents: + words = doc.split() + vocabulary.update(words) + + self.vocabulary = list(vocabulary) + self.word_index = {word: idx for idx, word in enumerate(self.vocabulary)} + self.index_word = {idx: word for idx, word in enumerate(self.vocabulary)} + + def generate_training_data(self, documents): + # Generate training data for the Skip-gram model + training_data = [] + for doc in documents: + words = doc.split() + for idx, word in enumerate(words): + target_word = self.word_index[word] + context = [self.word_index[words[i]] for i in range(max(0, idx - self.window_size), min(len(words), idx + self.window_size + 1)) if i != idx] + for context_word in context: + training_data.append((target_word, context_word)) + return training_data + + def train(self, documents, epochs=1000): + # Tokenize the documents and generate training data + self.tokenize(documents) + training_data = self.generate_training_data(documents) + + # Initialize weight matrices with random values + vocab_size = len(self.vocabulary) + self.W1 = np.random.uniform(-1, 1, (vocab_size, self.embedding_dim)) + self.W2 = np.random.uniform(-1, 1, (self.embedding_dim, vocab_size)) + + for epoch in range(epochs): + loss = 0 + for target_word, context_word in training_data: + # Forward pass + h = self.W1[target_word] # Hidden layer representation of the target word + u = np.dot(h, self.W2) # Output layer scores + y_pred = self.softmax(u) # Predicted probabilities + + # Calculate error + e = np.zeros(vocab_size) + e[context_word] = 1 + error = y_pred - e + + # Backpropagation + self.W1[target_word] -= self.learning_rate * np.dot(self.W2, error) + self.W2 -= self.learning_rate * np.outer(h, error) + + # Calculate loss (cross-entropy) + loss -= np.log(y_pred[context_word]) + + if (epoch + 1) % 100 == 0: + print(f'Epoch {epoch + 1}, Loss: {loss}') + + def softmax(self, x): + # Softmax function to convert scores into probabilities + e_x = np.exp(x - np.max(x)) + return e_x / e_x.sum(axis=0) + + def get_word_vector(self, word): + # Retrieve the vector representation of a word + return self.W1[self.word_index[word]] + + def get_vocabulary(self): + # Retrieve the vocabulary + return self.vocabulary +# Example usage +if __name__ == "__main__": + # Basic example usage + documents = [ + "the cat sat on the mat", + "the dog ate my homework", + "the cat ate the dog food", + "I love programming in Python", + "Machine learning is fun", + "Python is a versatile language", + "Learning new skills is always beneficial" + ] + + # Initialize and train the Word2Vec model + word2vec = Word2Vec() + word2vec.train(documents) + + # Print the vocabulary + print("Vocabulary:", word2vec.get_vocabulary()) + + # Print the word vectors for each word in the vocabulary + print("Word Vectors:") + for word in word2vec.get_vocabulary(): + vector = word2vec.get_word_vector(word) + 
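+        # Each vector is the learned W1 row for this word: embedding_dim values (10 by default)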
print(f"Vector for '{word}':", vector) + + # More example documents with mixed content + more_documents = [ + "the quick brown fox jumps over the lazy dog", + "a journey of a thousand miles begins with a single step", + "to be or not to be that is the question", + "the rain in Spain stays mainly in the plain", + "all human beings are born free and equal in dignity and rights" + ] + + # Initialize and train the Word2Vec model on new documents + word2vec_more = Word2Vec() + word2vec_more.train(more_documents) + + # Print the word vectors for selected words + print("\nWord Vectors for new documents:") + for word in ['quick', 'journey', 'be', 'rain', 'human']: + vector = word2vec_more.get_word_vector(word) + print(f"Vector for '{word}':", vector) \ No newline at end of file From 45eb22fef1a08c812ade973f3666d179a4a0a200 Mon Sep 17 00:00:00 2001 From: UTSAV SINGHAL Date: Wed, 24 Jul 2024 11:47:52 +0530 Subject: [PATCH 6/6] Update README.md --- NLP/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NLP/README.md b/NLP/README.md index 80ff8ef..65d36c1 100644 --- a/NLP/README.md +++ b/NLP/README.md @@ -6,7 +6,7 @@ | S.No | Algorithm | S.No. | Algorithm | S.No. | Algorithm | |-------|-----------|-------|-----------|-------|-----------| -| 1 | [Bag of Words](./Algorithms/BagOfWords) | 2 | [TF-IDF](./Algorithms/TF-IDF/tf-idf.ipynb) | 3 | [Word2Vec](./Algorithms/Word2Vec/word2vec.ipynb) | +| 1 | [Bag of Words](./Algorithms/BagOfWords) | 2 | [TF-IDF](./Algorithms/TF-IDF) | 3 | [Word2Vec](./Algorithms/Word2Vec) | | 4 | | 5 | | 6 | | ## Available Documentations