Merge pull request #12 from recohut/stage
Stage
sparsh-ai authored Feb 11, 2022
2 parents 6c41b71 + 1178d67 commit cc146c8
Showing 12 changed files with 16,722 additions and 2 deletions.
16,712 changes: 16,711 additions & 1 deletion _notebooks/2022-01-07-ncf.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion _notebooks/2022-01-09-lrgccf-gowalla.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _notebooks/2022-01-10-fm-ml.ipynb
{"cells":[{"cell_type":"markdown","source":["# FM on ML-100k in PyTorch"],"metadata":{"id":"OXuS4HHuT8wC"}},{"cell_type":"code","source":["!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-100k.zip\n","!unzip ml-100k.zip\n","!mv ml-100k/u.data ."],"metadata":{"id":"AJhJHCd6TT-d"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import torch\n","from torch import nn\n","from torch.utils.data import Dataset, DataLoader\n","from sklearn.utils import shuffle\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import mean_squared_error\n","from tqdm import tqdm\n","\n","learning_rate = 1e-3\n","weight_decay = 1e-4\n","epochs = 10\n","batch_szie = 1024\n","min_val, max_val = 1.0, 5.0\n","device = torch.device('cpu')\n","id_embedding_dim = 256 # id嵌入向量的长度\n","\n","\n","# fm模型\n","class FmLayer(nn.Module):\n","\n"," def __init__(self, p, k):\n"," super(FmLayer, self).__init__()\n"," self.p, self.k = p, k\n"," self.linear = nn.Linear(self.p, 1, bias=True)\n"," self.v = nn.Parameter(torch.Tensor(self.p, self.k), requires_grad=True)\n"," self.v.data.uniform_(-0.01, 0.01)\n"," self.drop = nn.Dropout(0.2)\n","\n"," def forward(self, x):\n"," linear_part = self.linear(x)\n"," inter_part1 = torch.pow(torch.mm(x, self.v), 2)\n"," inter_part2 = torch.mm(torch.pow(x, 2), torch.pow(self.v, 2))\n"," pair_interactions = torch.sum(torch.sub(inter_part1, inter_part2), dim=1)\n"," self.drop(pair_interactions)\n"," output = linear_part.transpose(1, 0) + 0.5 * pair_interactions\n"," return output.view(-1, 1)\n","\n","\n","class FM(nn.Module):\n"," def __init__(self, user_nums, item_nums, id_embedding_dim):\n"," super(FM, self).__init__()\n"," # 对用户, 物品的id进行嵌入\n"," self.user_id_vec = nn.Embedding(user_nums, id_embedding_dim)\n"," self.item_id_vec = nn.Embedding(item_nums, id_embedding_dim)\n","\n"," self.fm = FmLayer(id_embedding_dim * 2, 10)\n","\n"," def forward(self, u_id, i_id):\n"," u_vec = self.user_id_vec(u_id)\n"," i_vec = self.item_id_vec(i_id)\n"," x = torch.cat((u_vec, i_vec), dim=1)\n"," rate = self.fm(x)\n"," return rate\n","\n","\n","class FmDataset(Dataset):\n"," def __init__(self, uid, iid, rating):\n"," self.uid = uid\n"," self.iid = iid\n"," self.rating = rating\n","\n"," def __getitem__(self, index):\n"," return self.uid[index], self.iid[index], self.rating[index]\n","\n"," def __len__(self):\n"," return len(self.uid)\n","\n","\n","def train_iter(model, optimizer, data_loder, criterion):\n"," model.train()\n"," total_loss = 0\n"," total_len = 0\n","\n"," for index, (x_u, x_i, y) in enumerate(data_loder):\n"," x_u, x_i, y = x_u.to(device), x_i.to(device), y.to(device)\n"," y = (y - min_val) / (max_val - min_val) + 0.01\n"," y_pre = model(x_u, x_i)\n","\n"," loss = criterion(y.view(-1, 1), y_pre)\n"," optimizer.zero_grad()\n"," loss.backward()\n"," optimizer.step()\n","\n"," total_loss += loss.item() * len(y_pre)\n"," total_len += len(y_pre)\n","\n"," loss = total_loss / total_len\n"," return loss\n","\n","\n","def val_iter(model, data_loader):\n"," model.eval()\n"," labels, predicts = list(), list()\n","\n"," with torch.no_grad():\n"," for x_u, x_i, y in data_loader:\n"," x_u, x_i, y = x_u.to(device), x_i.to(device), y.to(device)\n"," y_pre = model(x_u, x_i)\n"," y_pre = min_val + (y_pre - 0.01) * (max_val - min_val)\n"," y_pre = torch.where(y_pre > 5.0, torch.full_like(y_pre, 5.0), y_pre)\n"," y_pre = torch.where(y_pre < 1.0, torch.full_like(y_pre, 1.0), y_pre)\n"," 
labels.extend(y.tolist())\n"," predicts.extend(y_pre.tolist())\n"," mse = mean_squared_error(np.array(labels), np.array(predicts))\n","\n"," return mse\n","\n","\n","def main():\n"," df = pd.read_csv('u.data', header=None, delimiter='\\t')\n"," len_df, u_max_id, i_max_id = len(df), max(df[0]) + 1, max(df[1]) + 1\n"," print(df.shape, max(df[0]), max(df[1]))\n"," x, y = df.iloc[:, :2], df.iloc[:, 2]\n"," x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2020)\n"," x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=2020)\n"," train_loader = DataLoader(\n"," FmDataset(np.array(x_train[0]), np.array(x_train[1]), np.array(y_train).astype(np.float32)), batch_size=batch_szie)\n"," val_loader = DataLoader(FmDataset(np.array(x_val[0]), np.array(x_val[1]), np.array(y_val).astype(np.float32)), batch_size=batch_szie)\n"," test_loader = DataLoader(FmDataset(np.array(x_test[0]), np.array(x_test[1]), np.array(y_test).astype(np.float32)), batch_size=batch_szie)\n","\n"," # 模型初始化\n"," model = FM(u_max_id, i_max_id, id_embedding_dim).to(device)\n"," optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)\n"," loss_func = torch.nn.MSELoss().to(device)\n","\n"," # 训练模型\n"," best_val_mse, best_val_epoch = 10, 0\n"," for epoch in range(epochs):\n"," loss = train_iter(model, optimizer, train_loader, loss_func)\n"," mse = val_iter(model, val_loader)\n"," print(\"epoch:{}, loss:{:.5}, mse:{:.5}\".format(epoch, loss, mse))\n"," if best_val_mse > mse:\n"," best_val_mse, best_val_epoch = mse, epoch\n"," torch.save(model, 'best_model')\n"," print(\"best val epoch is {}, mse is {}\".format(best_val_epoch, best_val_mse))\n"," model = torch.load('best_model').to(device)\n"," test_mse = val_iter(model, test_loader)\n"," print(\"test mse is {}\".format(test_mse))\n","\n","\n","if __name__ == '__main__':\n"," main()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"q8rEahLvTbqF","executionInfo":{"status":"ok","timestamp":1641538334270,"user_tz":-330,"elapsed":12873,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"20358627-dede-48d7-8a5f-88236d55aaae"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["(100000, 4) 943 1682\n","epoch:0, loss:0.2471, mse:1.7855\n","epoch:1, loss:0.095476, mse:1.3192\n","epoch:2, loss:0.074207, mse:1.1511\n","epoch:3, loss:0.066409, mse:1.0806\n","epoch:4, loss:0.062901, mse:1.046\n","epoch:5, loss:0.060974, mse:1.0257\n","epoch:6, loss:0.059704, mse:1.0118\n","epoch:7, loss:0.058753, mse:1.0015\n","epoch:8, loss:0.057982, mse:0.99323\n","epoch:9, loss:0.057325, mse:0.98636\n","best val epoch is 9, mse is 0.9863620211966587\n","test mse is 0.993012835364686\n"]}]},{"cell_type":"markdown","source":["---"],"metadata":{"id":"rsNyAAaUT5lA"}},{"cell_type":"code","source":["!pip install -q watermark\n","%reload_ext watermark\n","%watermark -a \"Sparsh A.\" -m -iv -u -t -d"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Kk0V69zDT5lE","executionInfo":{"status":"ok","timestamp":1641538388085,"user_tz":-330,"elapsed":4265,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"5dfb8942-7c65-4d4f-c546-382b6a4f04af"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Author: Sparsh A.\n","\n","Last 
updated: 2022-01-07 06:53:08\n","\n","Compiler : GCC 7.5.0\n","OS : Linux\n","Release : 5.4.144+\n","Machine : x86_64\n","Processor : x86_64\n","CPU cores : 2\n","Architecture: 64bit\n","\n","pandas : 1.1.5\n","numpy : 1.19.5\n","IPython: 5.5.0\n","torch : 1.10.0+cu111\n","\n"]}]},{"cell_type":"markdown","source":["---"],"metadata":{"id":"Nyd-d0uGT5lG"}},{"cell_type":"markdown","source":["**END**"],"metadata":{"id":"rVlR650LT5lG"}}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"colab":{"name":"2022-01-10-fm-ml.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P176240%20%7C%20FM%20on%20ML-100k%20in%20PyTorch.ipynb","timestamp":1644598926871},{"file_id":"1Qbw_3372DrLKATKz_66dgUd5EdA_8lwh","timestamp":1641538393107},{"file_id":"1npp4hgFBQRflbqyRW4TayaUFIJJA_xHo","timestamp":1639737398037},{"file_id":"1vh6Mr1C7uh08K4zR4B2VIfqsK22pkraT","timestamp":1639730564985},{"file_id":"1F1wdk7jG5W0jbVM1nZBOPV0TYuSO7frV","timestamp":1639730030880},{"file_id":"https://github.com/RecoHut-Projects/recohut/blob/S394070/nbs/models/tensorflow/deepmf.ipynb","timestamp":1639729505410}],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":0}
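The pairwise term in FmLayer.forward relies on the standard factorization-machine identity, which rewrites the sum over feature pairs as 0.5 * sum_f ((x·v_f)^2 − (x^2)·(v_f^2)) and brings the cost down from O(p^2·k) to O(p·k). Below is a minimal, self-contained sanity check of that identity with random tensors; the sizes and variable names are illustrative and not part of the notebook:

import torch

torch.manual_seed(0)
batch, p, k = 4, 8, 3            # hypothetical sizes for the check
x = torch.randn(batch, p)        # input features
v = torch.randn(p, k)            # factor matrix, analogous to FmLayer.v

# vectorized form used in FmLayer.forward
fast = 0.5 * (torch.mm(x, v).pow(2) - torch.mm(x.pow(2), v.pow(2))).sum(dim=1)

# brute-force double sum over feature pairs i < j
slow = torch.zeros(batch)
for i in range(p):
    for j in range(i + 1, p):
        slow += (v[i] * v[j]).sum() * x[:, i] * x[:, j]

print(torch.allclose(fast, slow, atol=1e-5))  # expected: True

The same vectorized form is what inter_part1 and inter_part2 compute in the notebook, before the dropout and the 0.5 scaling are applied.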

