-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
commit
- Loading branch information
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-13-deepconn.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P330133%20%7C%20DeepCoNN%20on%20Amazon%20Music%20Instruments%20in%20PyTorch.ipynb","timestamp":1644610385849},{"file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","timestamp":1640329037065}],"collapsed_sections":[],"mount_file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","authorship_tag":"ABX9TyM2tK6zA8IMzwgFJjeAgFJH"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[
{"cell_type":"markdown","source":["# DeepCoNN on Amazon Music Instruments in PyTorch"],"metadata":{"id":"2VZ8fr0qMt5t"}},
{"cell_type":"code","source":["!wget -q --show-progress https://github.com/RecoHut-Datasets/amazon_music_instruments/raw/v1/Musical_Instruments_5.json"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"L60wRz3KLutF","executionInfo":{"status":"ok","timestamp":1641537515379,"user_tz":-330,"elapsed":1220,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"2e592bdd-5451-4999-9b69-86d8fefaeaad"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\r Musical_I 0%[ ] 0 --.-KB/s \rMusical_Instruments 100%[===================>] 7.10M --.-KB/s in 0.03s \n"]}]},
{"cell_type":"code","source":[
"!wget -q --show-progress https://github.com/huangjunheng/recommendation_model/raw/master/DeepCoNN/data/embedding_data/stopwords.txt\n",
"!wget -q --show-progress https://github.com/huangjunheng/recommendation_model/raw/master/DeepCoNN/data/embedding_data/punctuations.txt"
],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pgm_ixhCRjdt","executionInfo":{"status":"ok","timestamp":1641537766384,"user_tz":-330,"elapsed":1239,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"219fb486-f23d-4f65-88ca-bc72002fb508"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["stopwords.txt 100%[===================>] 621 --.-KB/s in 0s \n","punctuations.txt 100%[===================>] 57 --.-KB/s in 0s \n"]}]},
{"cell_type":"code","source":[
"!wget -q --show-progress -c \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"\n",
"!gzip -d GoogleNews-vectors-negative300.bin.gz"
],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vcYWhWrlR0Na","executionInfo":{"status":"ok","timestamp":1641537933311,"user_tz":-330,"elapsed":95967,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"0ca55c5a-3290-4965-e73f-dfb1904343bf"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["GoogleNews-vectors- 100%[===================>] 1.53G 48.0MB/s in 29s \n"]}]},
{"cell_type":"markdown","source":["## Preprocessing"],"metadata":{"id":"_tqBBPOQQySy"}},
{"cell_type":"code","source":[
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import nltk\n",
"from sklearn.model_selection import train_test_split\n",
"from gensim.models import KeyedVectors\n",
"import pickle\n",
"from gensim.models.keyedvectors import Word2VecKeyedVectors"
],"metadata":{"id":"vt6fSpGqQ2La"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["nltk.download('wordnet')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2LL5FMAARKw6","executionInfo":{"status":"ok","timestamp":1641538034166,"user_tz":-330,"elapsed":1545,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"d2529bc2-b793-4cf7-b3cb-ad70bd0285a8"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[nltk_data] Downloading package wordnet to /root/nltk_data...\n","[nltk_data] Unzipping corpora/wordnet.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":12}]},
{"cell_type":"code","source":[
"# Directory prefix for every file this notebook reads or writes via `path + name`.\n",
"path = '/content/'\n",
"# Token used for padding and for words missing from the embedding vocabulary.\n",
"PAD_WORD = '<pad>'\n",
"# NOTE(review): presumably the index PAD_WORD receives once it is added to the\n",
"# pretrained vocabulary; it is not referenced in this notebook - TODO confirm.\n",
"PAD_WORD_ID = 3000000\n",
"# Width of one word vector; used to build the all-zero padding vector.\n",
"WORD_EMBEDDING_SIZE = 300\n",
"# Misspelled original name, kept as an alias so existing references keep working.\n",
"WORD_EMBEDDINF_SIZE = WORD_EMBEDDING_SIZE"
],"metadata":{"id":"fhuePKLCQ32f"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":[
"def process_raw_data(in_path, out_path):\n",
"    \"\"\"Clean the raw review dump and write it back out as JSON lines.\n",
"\n",
"    Keeps the reviewer id, item id, review text and rating columns (renamed\n",
"    to userID / itemID / review / rating), replaces the string ids with\n",
"    integer group numbers, and normalises every review: lower-casing,\n",
"    punctuation and stop-word removal, WordNet lemmatisation.\n",
"\n",
"    Args:\n",
"        in_path: path of the raw JSON-lines review file.\n",
"        out_path: path the cleaned JSON-lines file is written to.\n",
"    \"\"\"\n",
"    df = pd.read_json(in_path, lines=True)\n",
"    df = df[['reviewerID', 'asin', 'reviewText', 'overall']]\n",
"    df.columns = ['userID', 'itemID', 'review', 'rating']\n",
"\n",
"    # ngroup() numbers each distinct id, turning the string ids into integers.\n",
"    df['userID'] = df.groupby(df['userID']).ngroup()\n",
"    df['itemID'] = df.groupby(df['itemID']).ngroup()\n",
"\n",
"    with open('stopwords.txt') as f:\n",
"        stop_words = set(f.read().splitlines())\n",
"\n",
"    with open('punctuations.txt') as f:\n",
"        # Skip blank lines: replacing the empty string would insert a space\n",
"        # between every character of the review.\n",
"        punctuations = {line for line in f.read().splitlines() if line}\n",
"\n",
"    # Build the lemmatizer once instead of once per review.\n",
"    lemmatizer = nltk.WordNetLemmatizer()\n",
"\n",
"    def clean_review(review):\n",
"        review = review.lower()\n",
"        for p in punctuations:\n",
"            review = review.replace(p, ' ')\n",
"        tokens = review.split()\n",
"        # Drop stop words, then lemmatize the remaining tokens.\n",
"        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]\n",
"        return ' '.join(tokens)\n",
"\n",
"    # A record without review text is read as NaN; treat it as an empty review.\n",
"    df['review'] = df['review'].fillna('').apply(clean_review)\n",
"    df.to_json(out_path, orient='records', lines=True)\n",
"\n",
"\n",
"def get_word_vec():\n",
"    \"\"\"Load the pretrained word vectors, add the padding token and export the weights.\n",
"\n",
"    Reads 'GoogleNews-vectors-negative300.bin', registers PAD_WORD with an\n",
"    all-zero vector, and saves the full vector matrix as a torch tensor to\n",
"    `path + 'embedding_weight.pt'`.\n",
"\n",
"    Returns:\n",
"        The loaded KeyedVectors object, including PAD_WORD.\n",
"    \"\"\"\n",
"    # Load the pretrained word-embedding model.\n",
"    in_path = 'GoogleNews-vectors-negative300.bin'\n",
"    out_path = path + 'embedding_weight.pt'\n",
"    word_vec = KeyedVectors.load_word2vec_format(in_path, binary=True)\n",
"    # NOTE(review): `add` here and `.vocab` below look like the gensim 3.x\n",
"    # KeyedVectors API - confirm against the installed gensim version.\n",
"    word_vec.add([PAD_WORD], np.zeros([1, WORD_EMBEDDING_SIZE]))\n",
"\n",
"    # Save the pretrained vectors as a tensor so later training can load them directly.\n",
"    weight = torch.Tensor(word_vec.vectors)\n",
"    torch.save(weight, out_path)\n",
"    return word_vec\n",
"\n",
"\n",
"def load_embedding_weights(in_path=path + 'embedding_weight.pt'):\n",
"    \"\"\"Load the tensor written by get_word_vec.\n",
"\n",
"    The default path is computed once, when this function is defined.\n",
"    \"\"\"\n",
"    return torch.load(in_path)\n",
"\n",
"\n",
"def get_reviews_in_idx(data, word_vec):\n",
"    \"\"\"Convert review text to word-id lists and group the reviews by user and by item.\n",
"\n",
"    Args:\n",
"        data: DataFrame with userID, itemID and whitespace-tokenisable review columns.\n",
"        word_vec: KeyedVectors that already contains PAD_WORD.\n",
"\n",
"    Returns:\n",
"        (review_by_user, review_by_item): dicts keyed by userID / itemID whose\n",
"        values are DataFrames holding (itemID, review) / (userID, review).\n",
"        The caller's `data` is left unchanged.\n",
"    \"\"\"\n",
"    pad_wid = word_vec.vocab[PAD_WORD].index\n",
"\n",
"    def review2wid(review):\n",
"        # Words missing from the embedding vocabulary map to the padding id.\n",
"        return [word_vec.vocab[word].index if word in word_vec else pad_wid\n",
"                for word in review.split()]\n",
"\n",
"    # Work on a copy so the caller's frame keeps its text reviews.\n",
"    data = data.assign(review=data['review'].apply(review2wid))\n",
"    review_by_user = dict(list(data[['itemID', 'review']].groupby(data['userID'])))\n",
"    review_by_item = dict(list(data[['userID', 'review']].groupby(data['itemID'])))\n",
"    return review_by_user, review_by_item\n",
"\n",
"\n",
"def get_max_review_length(data, percentile=0.85):\n",
"    \"\"\"Return the `percentile` quantile of the per-review word count.\n",
"\n",
"    Expects the review column to still hold text, not word ids.\n",
"    \"\"\"\n",
"    review_lengths = data['review'].apply(lambda review: len(review.split()))\n",
"    max_length = int(review_lengths.quantile(percentile, interpolation='lower'))\n",
"    return max_length\n",
"\n",
"\n",
"def get_max_review_count(data, percentile=0.85):\n",
"    \"\"\"Return the larger of the per-user and per-item review-count quantiles.\"\"\"\n",
"    review_count_user = data['review'].groupby(data['userID']).count()\n",
"    review_count_user = int(review_count_user.quantile(percentile, interpolation='lower'))\n",
"\n",
"    review_count_item = data['review'].groupby(data['itemID']).count()\n",
"    review_count_item = int(review_count_item.quantile(percentile, interpolation='lower'))\n",
"\n",
"    return max(review_count_user, review_count_item)\n",
"\n",
"\n",
"def get_max_user_id(data):\n",
"    \"\"\"Return the largest userID in `data`.\"\"\"\n",
"    return int(data['userID'].max())\n",
"\n",
"\n",
"def get_max_item_id(data):\n",
"    \"\"\"Return the largest itemID in `data`.\"\"\"\n",
"    return int(data['itemID'].max())\n",
"\n",
"\n",
"def save_review_dict(data, word_vec, data_type):\n",
"    \"\"\"Pickle the per-user and per-item review dictionaries for `data_type`.\n",
"\n",
"    Files are written to `path` as user_review_word_idx_<data_type>.p and\n",
"    item_review_word_idx_<data_type>.p.\n",
"    \"\"\"\n",
"    user_review, item_review = get_reviews_in_idx(data, word_vec)\n",
"    # Context managers make sure the files are flushed and closed.\n",
"    with open(path + 'user_review_word_idx_{}.p'.format(data_type), 'wb') as f:\n",
"        pickle.dump(user_review, f)\n",
"    with open(path + 'item_review_word_idx_{}.p'.format(data_type), 'wb') as f:\n",
"        pickle.dump(item_review, f)\n",
"\n",
"\n",
"def get_review_dict(data_type):\n",
"    \"\"\"Load the two dictionaries written by save_review_dict for `data_type`.\n",
"\n",
"    Only load files produced by this notebook: unpickling untrusted data can\n",
"    execute arbitrary code.\n",
"    \"\"\"\n",
"    with open(path + 'user_review_word_idx_{}.p'.format(data_type), 'rb') as f:\n",
"        user_review = pickle.load(f)\n",
"    with open(path + 'item_review_word_idx_{}.p'.format(data_type), 'rb') as f:\n",
"        item_review = pickle.load(f)\n",
"    return user_review, item_review\n",
"\n",
"\n",
"def main():\n",
"    \"\"\"Run the preprocessing: clean, split, print dataset statistics, save review dicts.\"\"\"\n",
"    process_raw_data(path + 'Musical_Instruments_5.json', path + 'reviews.json')\n",
"    df = pd.read_json(path + 'reviews.json', lines=True)\n",
"    train, test = train_test_split(df, test_size=0.2, random_state=3)\n",
"    train, dev = train_test_split(train, test_size=0.2, random_state=4)\n",
"    # The 'train' dictionaries are built from train + dev, the 'test' ones from all rows.\n",
"    known_data = pd.concat([train, dev], axis=0)\n",
"    all_data = pd.concat([train, dev, test], axis=0)\n",
"\n",
"    print('max review length is {}'.format(get_max_review_length(all_data)))\n",
"    print('max review count is {}'.format(get_max_review_count(all_data)))\n",
"    print('max user id is {}'.format(get_max_user_id(all_data)))\n",
"    print('max item id is {}'.format(get_max_item_id(all_data)))\n",
"\n",
"    word_vec = get_word_vec()\n",
"\n",
"    save_review_dict(known_data, word_vec, 'train')\n",
"    save_review_dict(all_data, word_vec, 'test')\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
"    main()"
],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vSPtZeoJQ6cz","outputId":"3bb94082-1307-4326-a4c8-cbc483f6339e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["max review length is 78\n","max review count is 16\n","max user id is 1428\n","max item id is 899\n"]}]}
]} |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-13-personalize-cleanup.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P334820%20%7C%20Amazon%20Personalize%20Generic%20Module%20-%20Cleanup%20Layer.ipynb","timestamp":1644610448892}],"collapsed_sections":[],"mount_file_id":"1XBg8KLacHanAoOx3jj1ow-YntdpLhzt6","authorship_tag":"ABX9TyPoEKIFTqX9As2uD2zQRVLR"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[
{"cell_type":"markdown","source":["# Amazon Personalize Generic Module - Cleanup Layer"],"metadata":{"id":"pvcZdiGza8se"}},
{"cell_type":"code","metadata":{"id":"NYcJc82W3ivV"},"source":[
"import boto3\n",
"import json\n",
"import time\n",
"\n",
"\n",
"class personalize_cleanup:\n",
"    \"\"\"Thin wrapper around the boto3 Personalize client for deleting resources.\n",
"\n",
"    Every delete_* method forwards a single ARN to the boto3 client call of\n",
"    the same purpose. `self.personalize` holds the client, or None when the\n",
"    client could not be created.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        self._setup_connection()\n",
"\n",
"    def _setup_connection(self):\n",
"        \"\"\"Create the boto3 Personalize client and report the outcome on stdout.\n",
"\n",
"        Never raises for ordinary errors: on failure the cause is printed and\n",
"        `self.personalize` is left as None.\n",
"        \"\"\"\n",
"        # Define the attribute up front so a failed setup does not leave it missing.\n",
"        self.personalize = None\n",
"        try:\n",
"            self.personalize = boto3.client('personalize')\n",
"            print(\"SUCCESS | We can communicate with Personalize!\")\n",
"        except Exception as err:\n",
"            # Catch Exception instead of a bare except so KeyboardInterrupt and\n",
"            # SystemExit still propagate, and show the cause instead of hiding it.\n",
"            print(\"ERROR | Connection can't be established!\")\n",
"            print(\"ERROR | Cause: {!r}\".format(err))\n",
"\n",
"    def delete_campaign(self, campaign_arn):\n",
"        \"\"\"Delete the campaign identified by `campaign_arn`.\"\"\"\n",
"        self.personalize.delete_campaign(campaignArn=campaign_arn)\n",
"\n",
"    def delete_solution(self, solution_arn):\n",
"        \"\"\"Delete the solution identified by `solution_arn`.\"\"\"\n",
"        self.personalize.delete_solution(solutionArn=solution_arn)\n",
"\n",
"    def delete_tracker(self, tracker_arn):\n",
"        \"\"\"Delete the event tracker identified by `tracker_arn`.\"\"\"\n",
"        self.personalize.delete_event_tracker(eventTrackerArn=tracker_arn)\n",
"\n",
"    def delete_filter(self, filter_arn):\n",
"        \"\"\"Delete the filter identified by `filter_arn`.\"\"\"\n",
"        self.personalize.delete_filter(filterArn=filter_arn)\n",
"\n",
"    def delete_dataset(self, dataset_arn):\n",
"        \"\"\"Delete the dataset identified by `dataset_arn`.\"\"\"\n",
"        self.personalize.delete_dataset(datasetArn=dataset_arn)\n",
"\n",
"    def delete_schema(self, schema_arn):\n",
"        \"\"\"Delete the schema identified by `schema_arn`.\"\"\"\n",
"        self.personalize.delete_schema(schemaArn=schema_arn)\n",
"\n",
"    def delete_dataset_group(self, dataset_group_arn):\n",
"        \"\"\"Delete the dataset group identified by `dataset_group_arn`.\"\"\"\n",
"        self.personalize.delete_dataset_group(datasetGroupArn=dataset_group_arn)"
],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"47lrGWcD910V","executionInfo":{"status":"ok","timestamp":1630426082282,"user_tz":-330,"elapsed":8,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"18088fa3-e03b-42b4-9d43-82a9c95227fb"},"source":["pc = personalize_cleanup()"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["SUCCESS | We can communicate with Personalize!\n"]}]},
{"cell_type":"code","metadata":{"id":"ZS2inCBa-TCw"},"source":[
"# WARNING: destructive - running this cell deletes the dataset group named below.\n",
"pc.delete_dataset_group(dataset_group_arn = 'arn:aws:personalize:us-east-1:746888961694:dataset-group/personalize-poc-movielens')"
],"execution_count":null,"outputs":[]}
]} |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.