Merge pull request #13 from recohut/stage
sparsh-ai authored Feb 11, 2022
2 parents cc146c8 + ae5bd73 commit ca5de38
Showing 15 changed files with 15 additions and 1 deletion.
2 changes: 1 addition & 1 deletion _notebooks/2022-01-12-slist-yoochoose.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-ae-ml.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-baseliness-yoochoose.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-cab-mind.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-deepconn.ipynb
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-13-deepconn.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P330133%20%7C%20DeepCoNN%20on%20Amazon%20Music%20Instruments%20in%20PyTorch.ipynb","timestamp":1644610385849},{"file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","timestamp":1640329037065}],"collapsed_sections":[],"mount_file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","authorship_tag":"ABX9TyM2tK6zA8IMzwgFJjeAgFJH"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# DeepCoNN on Amazon Music Instruments in PyTorch"],"metadata":{"id":"2VZ8fr0qMt5t"}},{"cell_type":"code","source":["!wget -q --show-progress https://github.com/RecoHut-Datasets/amazon_music_instruments/raw/v1/Musical_Instruments_5.json"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"L60wRz3KLutF","executionInfo":{"status":"ok","timestamp":1641537515379,"user_tz":-330,"elapsed":1220,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"2e592bdd-5451-4999-9b69-86d8fefaeaad"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\r Musical_I 0%[ ] 0 --.-KB/s \rMusical_Instruments 100%[===================>] 7.10M --.-KB/s in 0.03s \n"]}]},{"cell_type":"code","source":["!wget -q --show-progress https://github.com/huangjunheng/recommendation_model/raw/master/DeepCoNN/data/embedding_data/stopwords.txt\n","!wget -q --show-progress https://github.com/huangjunheng/recommendation_model/raw/master/DeepCoNN/data/embedding_data/punctuations.txt"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pgm_ixhCRjdt","executionInfo":{"status":"ok","timestamp":1641537766384,"user_tz":-330,"elapsed":1239,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"219fb486-f23d-4f65-88ca-bc72002fb508"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["stopwords.txt 100%[===================>] 621 --.-KB/s in 0s \n","punctuations.txt 100%[===================>] 57 --.-KB/s in 0s \n"]}]},{"cell_type":"code","source":["!wget -q --show-progress -c \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"\n","!gzip -d GoogleNews-vectors-negative300.bin.gz"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vcYWhWrlR0Na","executionInfo":{"status":"ok","timestamp":1641537933311,"user_tz":-330,"elapsed":95967,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"0ca55c5a-3290-4965-e73f-dfb1904343bf"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["GoogleNews-vectors- 100%[===================>] 1.53G 48.0MB/s in 29s \n"]}]},{"cell_type":"markdown","source":["## Preprocessing"],"metadata":{"id":"_tqBBPOQQySy"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import torch\n","import nltk\n","from sklearn.model_selection import train_test_split\n","from gensim.models import KeyedVectors\n","import pickle\n","from gensim.models.keyedvectors import 
Word2VecKeyedVectors"],"metadata":{"id":"vt6fSpGqQ2La"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["nltk.download('wordnet')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2LL5FMAARKw6","executionInfo":{"status":"ok","timestamp":1641538034166,"user_tz":-330,"elapsed":1545,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"d2529bc2-b793-4cf7-b3cb-ad70bd0285a8"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[nltk_data] Downloading package wordnet to /root/nltk_data...\n","[nltk_data] Unzipping corpora/wordnet.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["path = '/content/'\n","PAD_WORD = '<pad>'\n","PAD_WORD_ID = 3000000\n","WORD_EMBEDDINF_SIZE = 300"],"metadata":{"id":"fhuePKLCQ32f"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def process_raw_data(in_path, out_path):\n"," df = pd.read_json(in_path, lines=True)\n"," df = df[['reviewerID', 'asin', 'reviewText', 'overall']]\n"," df.columns = ['userID', 'itemID', 'review', 'rating']\n","\n"," df['userID'] = df.groupby(df['userID']).ngroup()\n"," df['itemID'] = df.groupby(df['itemID']).ngroup()\n","\n"," with open('stopwords.txt') as f:\n"," stop_words = set(f.read().splitlines())\n","\n"," with open('punctuations.txt') as f:\n"," punctuations = set(f.read().splitlines())\n","\n"," def clean_review(review):\n"," lemmatizer = nltk.WordNetLemmatizer()\n"," review = review.lower()\n"," for p in punctuations:\n"," review = review.replace(p, ' ')\n"," tokens = review.split()\n"," tokens = [word for word in tokens if word not in stop_words]\n"," # 词形归并 词干提取\n"," tokens = [lemmatizer.lemmatize(word) for word in tokens]\n"," return ' '.join(tokens)\n","\n"," df['review'] = df['review'].apply(clean_review)\n"," df.to_json(out_path, orient='records', lines=True)\n","\n","\n","def get_word_vec():\n"," # 加载预训练词嵌入模型\n"," in_path = 'GoogleNews-vectors-negative300.bin'\n"," out_path = path + 'embedding_weight.pt'\n"," word_vec = KeyedVectors.load_word2vec_format(in_path, binary=True)\n"," word_vec.add([PAD_WORD], np.zeros([1, 300]))\n","\n"," # 保存预训练模型为tensor格式, 以便于后续训练\n"," weight = torch.Tensor(word_vec.vectors)\n"," torch.save(weight, out_path)\n"," return word_vec\n","\n","\n","def load_embedding_weights(in_path=path + 'embedding_weight.pt'):\n"," return torch.load(in_path)\n","\n","\n","def get_reviews_in_idx(data, word_vec):\n"," def review2wid(review):\n"," wids = []\n"," for word in review.split():\n"," if word in word_vec:\n"," wid = word_vec.vocab[word].index\n"," else:\n"," wid = word_vec.vocab[PAD_WORD].index\n"," wids.append(wid)\n"," return wids\n","\n"," data['review'] = data['review'].apply(review2wid)\n"," review_by_user = dict(list(data[['itemID', 'review']].groupby(data['userID'])))\n"," review_by_item = dict(list(data[['userID', 'review']].groupby(data['itemID'])))\n"," return review_by_user, review_by_item\n","\n","\n","def get_max_review_length(data, percentile=0.85):\n"," review_lengths = data['review'].apply(lambda review: len(review.split()))\n"," max_length = int(review_lengths.quantile(percentile, interpolation='lower'))\n"," return max_length\n","\n","\n","def get_max_review_count(data, percentile=0.85):\n"," review_count_user = data['review'].groupby(data['userID']).count()\n"," review_count_user = 
int(review_count_user.quantile(percentile, interpolation='lower'))\n","\n"," review_count_item = data['review'].groupby(data['itemID']).count()\n"," review_count_item = int(review_count_item.quantile(percentile, interpolation='lower'))\n","\n"," return max(review_count_user, review_count_item)\n","\n","\n","def get_max_user_id(data):\n"," return max(data['userID'])\n","\n","\n","def get_max_item_id(data):\n"," return max(data['itemID'])\n","\n","\n","def save_review_dict(data, word_vec, data_type):\n"," user_review, item_review = get_reviews_in_idx(data, word_vec)\n"," pickle.dump(user_review, open(path + 'user_review_word_idx_{}.p'.format(data_type), 'wb'))\n"," pickle.dump(item_review, open(path + 'item_review_word_idx_{}.p'.format(data_type), 'wb'))\n","\n","\n","def get_review_dict(data_type):\n"," user_review = pickle.load(open(path + 'user_review_word_idx_{}.p'.format(data_type), 'rb'))\n"," item_review = pickle.load(open(path + 'item_review_word_idx_{}.p'.format(data_type), 'rb'))\n"," return user_review, item_review\n","\n","\n","def main():\n"," process_raw_data(path + 'Musical_Instruments_5.json', path + 'reviews.json')\n"," df = pd.read_json(path + 'reviews.json', lines=True)\n"," train, test = train_test_split(df, test_size=0.2, random_state=3)\n"," train, dev = train_test_split(train, test_size=0.2, random_state=4)\n"," known_data = pd.concat([train, dev], axis=0)\n"," all_data = pd.concat([train, dev, test], axis=0)\n","\n"," print('max review length is {}'.format(get_max_review_length(all_data)))\n"," print('max review count is {}'.format(get_max_review_count(all_data)))\n"," print('max user id is {}'.format(get_max_user_id(all_data)))\n"," print('max item id is {}'.format(get_max_item_id(all_data)))\n","\n"," word_vec = get_word_vec()\n","\n"," save_review_dict(known_data, word_vec, 'train')\n"," save_review_dict(all_data, word_vec, 'test')\n","\n","\n","if __name__ == '__main__':\n"," main()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vSPtZeoJQ6cz","outputId":"3bb94082-1307-4326-a4c8-cbc483f6339e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["max review length is 78\n","max review count is 16\n","max user id is 1428\n","max item id is 899\n"]}]}]}
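The preprocessing run above saves two kinds of artifacts: the embedding matrix (embedding_weight.pt) and pickled user/item review-word-index dicts for the 'train' and 'test' splits. The following is a minimal sketch, not part of the diff, of how a downstream DeepCoNN training step might consume them; it assumes the notebook's functions and constants are in scope, the fixed review length of 78 is taken from the printed output, and pad_review is a hypothetical helper that does not appear in the notebook.

    # Sketch only (not in the diff): consume the saved preprocessing artifacts.
    import torch

    MAX_REVIEW_LENGTH = 78  # from the notebook's printed 85th-percentile review length

    def pad_review(wids, length=MAX_REVIEW_LENGTH, pad_id=PAD_WORD_ID):
        # Hypothetical helper: right-pad or truncate a word-id list to a fixed length.
        return (wids + [pad_id] * length)[:length]

    weight = load_embedding_weights()                      # (vocab + pad, 300) float tensor
    user_reviews, item_reviews = get_review_dict('train')  # dicts of DataFrames keyed by user/item id

    # Build the padded word-id matrix of one user's reviews, i.e. the "user document"
    # that DeepCoNN's user-side CNN tower consumes.
    some_user = next(iter(user_reviews))
    user_doc = torch.tensor([pad_review(w) for w in user_reviews[some_user]['review']])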
1 change: 1 addition & 0 deletions _notebooks/2022-01-13-equal-experience.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-explainable-bpr-lastfm.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-fm-torch.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-grouprec.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-listwise-ml.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-listwise-retail.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-personalize-cleanup.ipynb
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-13-personalize-cleanup.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P334820%20%7C%20Amazon%20Personalize%20Generic%20Module%20-%20Cleanup%20Layer.ipynb","timestamp":1644610448892}],"collapsed_sections":[],"mount_file_id":"1XBg8KLacHanAoOx3jj1ow-YntdpLhzt6","authorship_tag":"ABX9TyPoEKIFTqX9As2uD2zQRVLR"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Amazon Personalize Generic Module - Cleanup Layer"],"metadata":{"id":"pvcZdiGza8se"}},{"cell_type":"code","metadata":{"id":"NYcJc82W3ivV"},"source":["import boto3\n","import json\n","import time\n","\n","\n","class personalize_cleanup:\n"," def __init__(self):\n"," self._setup_connection()\n","\n"," def _setup_connection(self):\n"," try:\n"," self.personalize = boto3.client('personalize')\n"," print(\"SUCCESS | We can communicate with Personalize!\")\n"," except:\n"," print(\"ERROR | Connection can't be established!\")\n","\n"," def delete_campaign(self, campaign_arn):\n"," self.personalize.delete_campaign(campaignArn = campaign_arn)\n"," \n"," def delete_solution(self, solution_arn):\n"," self.personalize.delete_solution(solutionArn = solution_arn)\n"," \n"," def delete_tracker(self, tracker_arn):\n"," self.personalize.delete_event_tracker(eventTrackerArn = tracker_arn)\n"," \n"," def delete_filter(self, filter_arn):\n"," self.personalize.delete_filter(filterArn = filter_arn)\n"," \n"," def delete_dataset(self, dataset_arn):\n"," self.personalize.delete_dataset(datasetArn = dataset_arn)\n","\n"," def delete_schema(self, schema_arn):\n"," self.personalize.delete_schema(schemaArn = schema_arn)\n","\n"," def delete_dataset_group(self, dataset_group_arn):\n"," self.personalize.delete_dataset_group(datasetGroupArn = dataset_group_arn)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"47lrGWcD910V","executionInfo":{"status":"ok","timestamp":1630426082282,"user_tz":-330,"elapsed":8,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"18088fa3-e03b-42b4-9d43-82a9c95227fb"},"source":["pc = personalize_cleanup()"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["SUCCESS | We can communicate with Personalize!\n"]}]},{"cell_type":"code","metadata":{"id":"ZS2inCBa-TCw"},"source":["pc.delete_dataset_group(dataset_group_arn = 'arn:aws:personalize:us-east-1:746888961694:dataset-group/personalize-poc-movielens')"],"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions _notebooks/2022-01-13-personalize-inference.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-spotify-knn.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-13-tfrs-electronics.ipynb

Large diffs are not rendered by default.
