Temitope Oladokun

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Federated Learning & RaspberryPi ",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true,
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/shashigharti/federated-learning-on-raspberry-pi/blob/master/Federated_Learning_%26_RaspberryPi.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NFlWUbBa59YR",
        "colab_type": "code",
        "outputId": "5d7c65bd-2143-48ed-9b0c-5a59db9290a8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        }
      },
      "source": [
        "# Environment setup: install tf-encrypted, then fetch and install PySyft from source.\n",
        "# NOTE(review): none of the pip installs below pin a version, so the result depends on when the cell is run.\n",
        "!pip install tf-encrypted\n",
        "\n",
        "# Clone the `dev` branch of PySyft on the first run; on later runs pull into the existing checkout.\n",
        "! URL=\"https://github.com/openmined/PySyft.git\" && FOLDER=\"PySyft\" && if [ ! -d $FOLDER ]; then git clone -b dev --single-branch $URL; else (cd $FOLDER && git pull $URL && cd ..); fi;\n",
        "\n",
        "# Install PySyft from the checkout; stdout is discarded, stderr is still shown.\n",
        "!cd PySyft; python setup.py install  > /dev/null\n",
        "\n",
        "# Also put the local PySyft checkout on sys.path (only once, so re-running the cell is harmless).\n",
        "import os\n",
        "import sys\n",
        "module_path = os.path.abspath(os.path.join('./PySyft'))\n",
        "if module_path not in sys.path:\n",
        "    sys.path.append(module_path)\n",
        "    \n",
        "# Force a fresh install of the packages below.\n",
        "# NOTE(review): `websocket` and `websockets` are two different PyPI packages; the saved log shows\n",
        "# `websocket` pulling in gevent - confirm it is the package PySyft needs (and not `websocket-client`).\n",
        "!pip install --upgrade --force-reinstall lz4\n",
        "!pip install --upgrade --force-reinstall websocket\n",
        "!pip install --upgrade --force-reinstall websockets\n",
        "!pip install --upgrade --force-reinstall zstd"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting tf-encrypted\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/1f/82/cf15aeac92525da2f794956712e7ebf418819390dec783430ee242b52d0b/tf_encrypted-0.5.8-py3-none-manylinux1_x86_64.whl (2.1MB)\n",
            "\u001b[K     |████████████████████████████████| 2.1MB 2.8MB/s \n",
            "\u001b[?25hRequirement already satisfied: tensorflow<2,>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from tf-encrypted) (1.14.0)\n",
            "Collecting pyyaml>=5.1 (from tf-encrypted)\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265kB)\n",
            "\u001b[K     |████████████████████████████████| 266kB 46.3MB/s \n",
            "\u001b[?25hRequirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.6/dist-packages (from tf-encrypted) (1.16.4)\n",
            "Requirement already satisfied: tensorboard<1.15.0,>=1.14.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.14.0)\n",
            "Requirement already satisfied: tensorflow-estimator<1.15.0rc0,>=1.14.0rc0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.14.0)\n",
            "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.0.8)\n",
            "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.15.0)\n",
            "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.1.0)\n",
            "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.1.0)\n",
            "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (0.33.4)\n",
            "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.12.0)\n",
            "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (1.11.2)\n",
            "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (0.7.1)\n",
            "Requirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (0.2.2)\n",
            "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (0.1.7)\n",
            "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (3.7.1)\n",
            "Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2,>=1.12.0->tf-encrypted) (0.8.0)\n",
            "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow<2,>=1.12.0->tf-encrypted) (3.1.1)\n",
            "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow<2,>=1.12.0->tf-encrypted) (41.0.1)\n",
            "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow<2,>=1.12.0->tf-encrypted) (0.15.5)\n",
            "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tensorflow<2,>=1.12.0->tf-encrypted) (2.8.0)\n",
            "Building wheels for collected packages: pyyaml\n",
            "  Building wheel for pyyaml (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for pyyaml: filename=PyYAML-5.1.2-cp36-cp36m-linux_x86_64.whl size=44105 sha256=d43e3214bccf1bb6bb5b0918463fbb4e1d2d953614c7e947965eaf34246f4233\n",
            "  Stored in directory: /root/.cache/pip/wheels/d9/45/dd/65f0b38450c47cf7e5312883deb97d065e030c5cca0a365030\n",
            "Successfully built pyyaml\n",
            "Installing collected packages: pyyaml, tf-encrypted\n",
            "  Found existing installation: PyYAML 3.13\n",
            "    Uninstalling PyYAML-3.13:\n",
            "      Successfully uninstalled PyYAML-3.13\n",
            "Successfully installed pyyaml-5.1.2 tf-encrypted-0.5.8\n",
            "Cloning into 'PySyft'...\n",
            "remote: Enumerating objects: 30364, done.\u001b[K\n",
            "remote: Total 30364 (delta 0), reused 0 (delta 0), pack-reused 30364\u001b[K\n",
            "Receiving objects: 100% (30364/30364), 33.00 MiB | 23.42 MiB/s, done.\n",
            "Resolving deltas: 100% (20363/20363), done.\n",
            "zip_safe flag not set; analyzing archive contents...\n",
            "zip_safe flag not set; analyzing archive contents...\n",
            "__pycache__.zstd.cpython-36: module references __file__\n",
            "Collecting lz4\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/0a/c6/96bbb3525a63ebc53ea700cc7d37ab9045542d33b4d262d0f0408ad9bbf2/lz4-2.1.10-cp36-cp36m-manylinux1_x86_64.whl (385kB)\n",
            "\u001b[K     |████████████████████████████████| 389kB 2.8MB/s \n",
            "\u001b[31mERROR: syft 0.1.23a1 has requirement msgpack>=0.6.1, but you'll have msgpack 0.5.6 which is incompatible.\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: lz4\n",
            "  Found existing installation: lz4 2.1.10\n",
            "    Uninstalling lz4-2.1.10:\n",
            "      Successfully uninstalled lz4-2.1.10\n",
            "Successfully installed lz4-2.1.10\n",
            "Collecting websocket\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/f2/6d/a60d620ea575c885510c574909d2e3ed62129b121fa2df00ca1c81024c87/websocket-0.2.1.tar.gz (195kB)\n",
            "\u001b[K     |████████████████████████████████| 204kB 2.8MB/s \n",
            "\u001b[?25hCollecting gevent (from websocket)\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/f2/ca/5b5962361ed832847b6b2f9a2d0452c8c2f29a93baef850bb8ad067c7bf9/gevent-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)\n",
            "\u001b[K     |████████████████████████████████| 5.5MB 46.1MB/s \n",
            "\u001b[?25hCollecting greenlet (from websocket)\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/bf/45/142141aa47e01a5779f0fa5a53b81f8379ce8f2b1cd13df7d2f1d751ae42/greenlet-0.4.15-cp36-cp36m-manylinux1_x86_64.whl (41kB)\n",
            "\u001b[K     |████████████████████████████████| 51kB 22.0MB/s \n",
            "\u001b[?25hBuilding wheels for collected packages: websocket\n",
            "  Building wheel for websocket (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for websocket: filename=websocket-0.2.1-cp36-none-any.whl size=192134 sha256=46d452d7b290beb083eccb732aa65d8fef99d0db9514cdb5d68eeb0e6afb25ef\n",
            "  Stored in directory: /root/.cache/pip/wheels/35/f7/5c/9e8243838269ea93f05295708519a6e183fa6b515d9ce3b636\n",
            "Successfully built websocket\n",
            "Installing collected packages: greenlet, gevent, websocket\n",
            "  Found existing installation: greenlet 0.4.15\n",
            "    Uninstalling greenlet-0.4.15:\n",
            "      Successfully uninstalled greenlet-0.4.15\n",
            "  Found existing installation: gevent 1.4.0\n",
            "    Uninstalling gevent-1.4.0:\n",
            "      Successfully uninstalled gevent-1.4.0\n",
            "Successfully installed gevent-1.4.0 greenlet-0.4.15 websocket-0.2.1\n",
            "Collecting websockets\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/f0/4b/ad228451b1c071c5c52616b7d4298ebcfcac5ae8515ede959db19e4cd56d/websockets-8.0.2-cp36-cp36m-manylinux1_x86_64.whl (72kB)\n",
            "\u001b[K     |████████████████████████████████| 81kB 3.2MB/s \n",
            "\u001b[31mERROR: syft 0.1.23a1 has requirement msgpack>=0.6.1, but you'll have msgpack 0.5.6 which is incompatible.\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: websockets\n",
            "  Found existing installation: websockets 8.0.2\n",
            "    Uninstalling websockets-8.0.2:\n",
            "      Successfully uninstalled websockets-8.0.2\n",
            "Successfully installed websockets-8.0.2\n",
            "Collecting zstd\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/22/37/6a7ba746ebddbd6cd06de84367515d6bc239acd94fb3e0b1c85788176ca2/zstd-1.4.1.0.tar.gz (454kB)\n",
            "\u001b[K     |████████████████████████████████| 460kB 2.7MB/s \n",
            "\u001b[?25hBuilding wheels for collected packages: zstd\n",
            "  Building wheel for zstd (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for zstd: filename=zstd-1.4.1.0-cp36-cp36m-linux_x86_64.whl size=1067098 sha256=4378524782195c5812206e3f2691909d5690c78bd1383434b34010f1c02974b4\n",
            "  Stored in directory: /root/.cache/pip/wheels/66/3f/ee/ac08c81af7c1b24a80c746df669ea3cb37542d27877d66ccf4\n",
            "Successfully built zstd\n",
            "\u001b[31mERROR: syft 0.1.23a1 has requirement msgpack>=0.6.1, but you'll have msgpack 0.5.6 which is incompatible.\u001b[0m\n",
            "Installing collected packages: zstd\n",
            "  Found existing installation: zstd 1.4.1.0\n",
            "    Uninstalling zstd-1.4.1.0:\n",
            "      Successfully uninstalled zstd-1.4.1.0\n",
            "Successfully installed zstd-1.4.1.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NbksiWKqHSKe",
        "colab_type": "text"
      },
      "source": [
        "# Importing the necessary libraries"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kq0bR3hV6qW0",
        "colab_type": "code",
        "outputId": "bdb4df99-93ec-4c90-d265-f17c75adc832",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "from __future__ import print_function\n",
        "import argparse\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import torch.optim as optim\n",
        "torch.__version__"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'1.1.0'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "l-ecdXAvagy3",
        "colab_type": "text"
      },
      "source": [
        "## Start your syft workers"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vJelG_lqaurB",
        "colab_type": "text"
      },
      "source": [
        ""
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oaRPUY0aHIPo",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Create the syft workers; `Ayanfunke` is the worker the next code cell sends a tensor to.\n",
        "import syft as sy\n",
        "hook = sy.TorchHook(torch)  # hook PyTorch, i.e. add the extra functionality needed to support federated learning\n",
        "Temitope = sy.VirtualWorker(hook, id=\"Temitope\")   # define the worker Temitope\n",
        "#Sarah = sy.VirtualWorker(hook, id= \"Sarah\")     #and sarah\n",
        "Ayanfunke = sy.VirtualWorker(hook, id=\"Ayanfunke\")     # and the worker Ayanfunke"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3sKG0llXa0Xr",
        "colab_type": "text"
      },
      "source": [
        "# Start your code"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9vnhBTDGZaqx",
        "colab_type": "code",
        "outputId": "56ba5ca6-6aae-413a-c255-1b6b702420bf",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Smoke test for the worker setup: send a one-element tensor to the Ayanfunke worker,\n",
        "# multiply the handle returned by .send() by 2 and bring the result back with .get().\n",
        "x = torch.tensor([1.]).send(Ayanfunke)\n",
        "y = (x * 2).get()\n",
        "# Bare expression so the notebook displays the result (the saved output is tensor([2.])).\n",
        "y"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "tensor([2.])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5cSxr6US2UnE",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Install the project requirements. The leading `!` runs pip through the shell, like the other\n",
        "# install cells; a bare `pip install ...` line is not valid Python.\n",
        "# NOTE(review): the path is relative to the notebook's working directory - confirm the file exists in the runtime.\n",
        "!pip install -r \"../../../requirements.txt\""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CRHRnKwTCylj",
        "colab_type": "code",
        "outputId": "71b883e1-66f6-4025-e132-dd84b009a309",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "from zipfile import ZipFile\n",
        "filename = \"Nigerian Names.zip\"\n",
        "\n",
        "# Unpack the dataset archive into the current working directory.\n",
        "# The handle is called `archive` so the built-in `zip` is not rebound at notebook scope,\n",
        "# which would break every later cell that calls zip(...).\n",
        "with ZipFile(filename, 'r') as archive:\n",
        "    archive.extractall()\n",
        "    print('Done')"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Done\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HP3yfMPfddwm",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# The code below will remove the data directory from the cloud environment. I am leaving it commented out until I need it.\n",
        "#!rm -rf data"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "h_ZvXrHeBAE0",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#!wget https://github.com/TemitopeOladokun/FederatedLearningandRaspberryPi/tree/master/data/NigerianNames\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zgUP_5Rz2XXs",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from __future__ import unicode_literals, print_function, division\n",
        "#from torch.utils.data import Dataset\n",
        "\n",
        "\n",
        "import torch\n",
        "from io import open\n",
        "import glob\n",
        "import os\n",
        "import numpy as np\n",
        "import unicodedata\n",
        "import string\n",
        "import random\n",
        "import torch.nn as nn\n",
        "import time\n",
        "import math\n",
        "import syft as sy\n",
        "import pandas as pd\n",
        "import random\n",
        "from syft.frameworks.torch.federated import utils\n",
        "\n",
        "from syft.workers import WebsocketClientWorker\n",
        "import matplotlib.pyplot as plt\n",
        "import matplotlib.ticker as ticker"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JF2nN0li79Pt",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def findFiles(path):\n",
        "    \"\"\"Return the list of filesystem paths that match the glob pattern ``path``.\"\"\"\n",
        "    matching_paths = glob.glob(path)\n",
        "    return matching_paths\n",
        "\n",
        "def readLines(filename):\n",
        "    \"\"\"Read a UTF-8 text file and return its lines, each converted with unicodeToAscii.\n",
        "\n",
        "    Leading and trailing whitespace of the whole file is stripped before it is\n",
        "    split on newline characters, so an empty file yields a single empty string.\n",
        "    \"\"\"\n",
        "    # The context manager closes the file handle deterministically; a bare\n",
        "    # open(...).read() leaves that to the garbage collector.\n",
        "    with open(filename, encoding='utf-8') as handle:\n",
        "        lines = handle.read().strip().split('\\n')\n",
        "    return [unicodeToAscii(line) for line in lines]\n",
        "\n",
        "def unicodeToAscii(s):\n",
        "    \"\"\"Drop combining marks from ``s`` and keep only characters found in ``all_letters``.\n",
        "\n",
        "    The string is NFD-normalised first; characters whose Unicode category is\n",
        "    'Mn' are then discarded, as is anything not contained in ``all_letters``.\n",
        "    \"\"\"\n",
        "    kept_chars = []\n",
        "    for char in unicodedata.normalize('NFD', s):\n",
        "        if unicodedata.category(char) == 'Mn':\n",
        "            continue\n",
        "        if char not in all_letters:\n",
        "            continue\n",
        "        kept_chars.append(char)\n",
        "    return ''.join(kept_chars)\n",
        "    "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xIo8AabEDJIN",
        "colab_type": "code",
        "outputId": "53221bcd-4d22-4bce-c734-e5b2a6211544",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 153
        }
      },
      "source": [
        "# Alphabet for the one-hot encoding used further down: ASCII letters plus a few punctuation characters.\n",
        "all_letters = string.ascii_letters + \" .,;'\"\n",
        "n_letters = len(all_letters)\n",
        "\n",
        "# Dictionary with the category (file name without extension) as key and its list of names as value.\n",
        "# Example: category_lines[\"Yoruba\"] = [\"Temitope\", \"Adeola\", ...]\n",
        "category_lines = {}\n",
        "# List of the categories found in the data, in the order the files are returned by findFiles.\n",
        "all_categories = []\n",
        "\n",
        "print (\"This dataset has Nigerian Names\" + \"\\n\")\n",
        "#print(\"Amount of categories:\" + str(n_categories) + \"\\n\")\n",
        "\n",
        "# Each .txt file is one category: its base name is the label and its lines are the names.\n",
        "for filename in findFiles('Nigerian Names/*.txt'):\n",
        "    print(filename) \n",
        "    category = os.path.splitext(os.path.basename(filename))[0]\n",
        "    all_categories.append(category)\n",
        "    lines = readLines(filename)\n",
        "    category_lines[category] = lines   \n",
        "    \n",
        "n_categories = len(all_categories)\n",
        "\n",
        "#print (\"This dataset has Nigerian names included\" + \"\\n\")\n",
        "print(\"Amount of categories:\" + str(n_categories))"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "This dataset has Nigerian Names\n",
            "\n",
            "Nigerian Names/Edo.txt\n",
            "Nigerian Names/Urhobo.txt\n",
            "Nigerian Names/Hausa.txt\n",
            "Nigerian Names/Yoruba.txt\n",
            "Nigerian Names/Igbo.txt\n",
            "Amount of categories:5\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vQffmR5yLJ0L",
        "colab_type": "code",
        "outputId": "e6c69630-c3b6-42aa-ef0f-c36c0bce908f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 323
        }
      },
      "source": [
        "# Preview the first few names of two categories, one name per line.\n",
        "print(\"These are Yoruba names\")\n",
        "yoruba_preview = category_lines['Yoruba'][:10]\n",
        "print(\"\\n\".join(yoruba_preview))\n",
        "print(\"\\n\")\n",
        "print(\"These are Urhobo names\")\n",
        "urhobo_preview = category_lines['Urhobo'][:4]\n",
        "print(\"\\n\".join(urhobo_preview))"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "These are Yoruba names\n",
            "Temitope\n",
            "Adeola\n",
            "Ayanfunke\n",
            "Eyitayo\n",
            "Ayodele\n",
            "Oyeleke\n",
            "Funke\n",
            "Olayemi\n",
            "Damilola\n",
            "Oluwaseun\n",
            "\n",
            "\n",
            "These are Urhobo names\n",
            "Ejiro\n",
            "Efetobo\n",
            "Anaborhi\n",
            "Edewor\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "j-8sqnGyJEpH",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Imported here so this cell also runs on a fresh kernel; the import cell has it commented out.\n",
        "from torch.utils.data import Dataset\n",
        "\n",
        "\n",
        "class LanguageDataset(Dataset):\n",
        "    \"\"\"Dataset pairing each input sample with its category label.\n",
        "\n",
        "    Args:\n",
        "        text: indexable collection of input samples.\n",
        "        labels: indexable collection of category labels, aligned with ``text``.\n",
        "        transform: optional callable applied to a sample in ``__getitem__``.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, text, labels, transform=None):\n",
        "        self.data = text\n",
        "        self.targets = labels  # categories\n",
        "        self.transform = transform\n",
        "\n",
        "    def to_torchtensor(self):\n",
        "        \"\"\"Convert ``data`` and ``targets`` from numpy arrays to torch tensors in place.\n",
        "\n",
        "        Reads and writes the attributes set in ``__init__``. ``torch.from_numpy``\n",
        "        takes no ``requires_grad`` argument, so none is passed.\n",
        "        \"\"\"\n",
        "        self.data = torch.from_numpy(self.data)\n",
        "        self.targets = torch.from_numpy(self.targets)\n",
        "\n",
        "    def __len__(self):\n",
        "        \"\"\"Return the number of samples in the dataset.\"\"\"\n",
        "        return len(self.data)\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        \"\"\"Return the ``(sample, target)`` pair at position ``idx``.\n",
        "\n",
        "        The transform, when one was given, is applied to the sample only.\n",
        "        \"\"\"\n",
        "        sample = self.data[idx]\n",
        "        target = self.targets[idx]\n",
        "\n",
        "        if self.transform:\n",
        "            sample = self.transform(sample)\n",
        "\n",
        "        return sample, target"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OI7az0p0K2_0",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Settings for the program; most of them are needed further down the notebook.\n",
        "class Arguments:\n",
        "    \"\"\"Plain container holding the run's hyper-parameters and reporting intervals.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        # Keyword order is the attribute order of the instance.\n",
        "        self.__dict__.update(\n",
        "            batch_size=1,\n",
        "            learning_rate=0.005,\n",
        "            epochs=10000,\n",
        "            federate_after_n_batches=15000,\n",
        "            seed=1,\n",
        "            print_every=200,\n",
        "            plot_every=100,\n",
        "            use_cuda=False,\n",
        "        )\n",
        "\n",
        "\n",
        "args = Arguments()\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4XZSzKp4o4Up",
        "colab_type": "code",
        "outputId": "da68bcf0-3a9e-4fea-c1b4-15be62997900",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "%%latex\n",
        "\n",
        "\\begin{split}\n",
        "names\\_list = [d_1,...,d_n]  \\\\\n",
        "\n",
        "category\\_list = [c_1,...,c_n] \n",
        "\\end{split}\n",
        "\n",
        "\n",
        "Where $n$ is the total amount of data points"
      ],
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/latex": "\n\\begin{split}\nnames\\_list = [d_1,...,d_n]  \\\\\n\ncategory\\_list = [c_1,...,c_n] \n\\end{split}\n\n\nWhere $n$ is the total amount of data points",
            "text/plain": [
              "<IPython.core.display.Latex object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "I-_MgMJv0j4a",
        "colab_type": "code",
        "outputId": "df77c131-2c50-48d3-be0f-8733a1a11731",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 122
        }
      },
      "source": [
        "# Flatten category_lines into two parallel lists: names_list[i] is a name (X)\n",
        "# and category_list[i] is the category that name was read from (Y).\n",
        "names_list, category_list = [], []\n",
        "\n",
        "for nation in category_lines:\n",
        "    names = category_lines[nation]\n",
        "    for name in names:\n",
        "        names_list += [name]\n",
        "        category_list += [nation]\n",
        "\n",
        "# Check the result: the two previews must line up position by position.\n",
        "print(names_list[:20])\n",
        "print(category_list[:20])\n",
        "\n",
        "print(\"\\n \\n Amount of data points loaded: \" + str(len(names_list)))\n",
        "\n"
      ],
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['Adesua', 'Obi', 'Ogheneme', 'Eromosele', 'Ejiro', 'Efetobo', 'Anaborhi', 'Edewor', 'Ejiroghene', 'Edafetanure', 'Efemena', 'Efemuaye', 'Efetobore', 'Etanomare', 'Etaredafe', 'Omonigho', 'Ighomuedafe', 'Ighovavwerhe', 'Omonigho', 'Omonoro']\n",
            "['Edo', 'Edo', 'Edo', 'Edo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo', 'Urhobo']\n",
            "\n",
            " \n",
            " Amount of data points loaded: 55\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BwOuKoHR01Ji",
        "colab_type": "code",
        "outputId": "45bd508d-0a33-4540-d5d2-bb7abcd4003b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 71
        }
      },
      "source": [
        "# Assign an integer code to every category; element [0] of pd.factorize's result is the array of codes.\n",
        "# In the saved output the first category seen ('Edo') gets 0 and the next ('Urhobo') gets 1.\n",
        "categories_numerical = pd.factorize(category_list)[0]\n",
        "# Wrap the codes in a torch.long tensor ...\n",
        "category_tensor = torch.tensor(np.array(categories_numerical), dtype=torch.long)\n",
        "# ... and convert back to a numpy array, the form torch.from_numpy in LanguageDataset.to_torchtensor takes.\n",
        "categories_numpy = np.array(category_tensor)\n",
        "\n",
        "# Show the first names next to their integer categories.\n",
        "print(names_list[0:20])\n",
        "print(categories_numpy[0:20])\n",
        "\n"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['Adesua', 'Obi', 'Ogheneme', 'Eromosele', 'Ejiro', 'Efetobo', 'Anaborhi', 'Edewor', 'Ejiroghene', 'Edafetanure', 'Efemena', 'Efemuaye', 'Efetobore', 'Etanomare', 'Etaredafe', 'Omonigho', 'Ighomuedafe', 'Ighovavwerhe', 'Omonigho', 'Omonoro']\n",
            "[0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "teVvzmQbZ5rO",
        "colab_type": "code",
        "outputId": "9831be9d-f03c-41d1-cc4e-c8b51e653808",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 510
        }
      },
      "source": [
        "def letterToIndex(letter):\n",
        "    \"\"\"Return the position of ``letter`` in ``all_letters`` (``str.find`` gives -1 when it is absent).\"\"\"\n",
        "    position = all_letters.find(letter)\n",
        "    return position\n",
        "    \n",
        "def letterToTensor(letter):\n",
        "    \"\"\"Demonstration helper: one-hot encode ``letter`` as a <1 x n_letters> tensor.\"\"\"\n",
        "    one_hot = torch.zeros(1, n_letters)\n",
        "    column = letterToIndex(letter)\n",
        "    one_hot[0, column] = 1\n",
        "    return one_hot\n",
        "\n",
        "def lineToTensor(line):\n",
        "    \"\"\"One-hot encode ``line`` as a <line_length x 1 x n_letters> tensor, one row per character.\"\"\"\n",
        "    encoded = torch.zeros(len(line), 1, n_letters)\n",
        "    for row, char in enumerate(line):\n",
        "        encoded[row, 0, letterToIndex(char)] = 1\n",
        "    return encoded\n",
        "    \n",
        "    \n",
        "    \n",
        "def list_strings_to_list_tensors(names_list):\n",
        "    \"\"\"Encode every string with lineToTensor and return the results as a list of numpy arrays.\"\"\"\n",
        "    return [lineToTensor(line).numpy() for line in names_list]\n",
        "\n",
        "# Encode every name; each entry is a numpy array of shape (len(name), 1, n_letters).\n",
        "lines_tensors = list_strings_to_list_tensors(names_list)\n",
        "\n",
        "# Inspect one example (index 40 is an arbitrary pick): the name, its one-hot encoding and the shape.\n",
        "print(names_list[40])\n",
        "print(lines_tensors[40])\n",
        "print(lines_tensors[40].shape)\n",
        "\n"
      ],
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Ololade\n",
            "[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
            "\n",
            " [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
            "\n",
            " [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
            "\n",
            " [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
            "\n",
            " [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
            "\n",
            " [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
            "\n",
            " [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
            "   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]\n",
            "(7, 1, 57)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LdCglO02aGA6",
        "colab_type": "code",
        "outputId": "0728efb4-50bb-4c08-cfbe-9c898f5aada1",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        }
      },
      "source": [
        "max_line_size = max(len(x) for x in lines_tensors)\n",
        "\n",
        "def lineToTensorFillEmpty(line, max_line_size):\n",
        "    tensor = torch.zeros(max_line_size, 1, n_letters) #notice the difference between this method and the previous one\n",
        "    for li, letter in enumerate(line):\n",
        "        tensor[li][0][letterToIndex(letter)] = 1\n",
        "        \n",
        "        #Vectors with (0,0,.... ,0) are placed where there are no characters\n",
        "    return tensor\n",
        "\n",
        "def list_strings_to_list_tensors_fill_empty(names_list):\n",
        "    lines_tensors = []\n",
        "    for index, line in enumerate(names_list):\n",
        "        lineTensor = lineToTensorFillEmpty(line, max_line_size)\n",
        "        lines_tensors.append(lineTensor)\n",
        "    return(lines_tensors)\n",
        "\n",
        "lines_tensors = list_strings_to_list_tensors_fill_empty(names_list)\n",
        "\n",
        "#Let's take a look at what a word now looks like\n",
        "print(names_list[40])\n",
        "print(lines_tensors[40])\n",
        "print(lines_tensors[40].shape)"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Ololade\n",
            "tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]],\n",
            "\n",
            "        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
            "          0., 0., 0., 0., 0., 0.]]])\n",
            "torch.Size([13, 1, 57])\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EOcm7kP0agMq",
        "colab_type": "code",
        "outputId": "e5dfbc40-f518-4ca4-86f5-8bd6cf0bbd0f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "#And finally, from a list, we can create a numpy array with all our word embeddings having the same shape:\n",
        "array_lines_tensors = np.stack(lines_tensors)\n",
        "#However, such operation introduces one extra dimension (look at the dimension with index=2 having size '1')\n",
        "print(array_lines_tensors.shape)\n",
        "#Because that dimension just has size 1, we can get rid of it with the following function call\n",
        "array_lines_proper_dimension = np.squeeze(array_lines_tensors, axis=2)\n",
        "print(array_lines_proper_dimension.shape)\n",
        "\n"
      ],
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(55, 13, 1, 57)\n",
            "(55, 13, 57)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0JVn6xHCaxCd",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "a80d8fd9-19f5-4d6b-f7f6-ea11e3453850"
      },
      "source": [
        "def find_start_index_per_category(category_list):\n",
        "    categories_start_index = {}\n",
        "    \n",
        "    #Initialize every category with an empty list\n",
        "    for category in all_categories:\n",
        "        categories_start_index[category] = []\n",
        "    \n",
        "    #Insert the start index of each category into the dictionary categories_start_index\n",
        "    #Example: \"Italian\" --> 203\n",
        "    #         \"Spanish\" --> 19776\n",
        "    last_category = None\n",
        "    i = 0\n",
        "    for name in names_list:\n",
        "        cur_category = category_list[i]\n",
        "        if(cur_category != last_category):\n",
        "            categories_start_index[cur_category] = i\n",
        "            last_category = cur_category\n",
        "        \n",
        "        i = i + 1\n",
        "        \n",
        "    return(categories_start_index)\n",
        "\n",
        "categories_start_index = find_start_index_per_category(category_list)\n",
        "\n",
        "print(categories_start_index)\n"
      ],
      "execution_count": 40,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'Edo': 0, 'Urhobo': 4, 'Hausa': 23, 'Yoruba': 29, 'Igbo': 42}\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}