Skip to content

Commit

Permalink
Host our own copy of GloVe
Browse files: browse the repository at this point in the history
  • Loading branch information
jre21 committed Jun 29, 2022
1 parent 19b2f14 commit 60f274e
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 25 deletions.
2 changes: 1 addition & 1 deletion mindmeld/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def register_func(self, name=None):
def _decorator(func):
func_name = name or func.__name__
if not callable(func):
raise TypeError("Invalid function type %s.", func_name) # pylint: disable=W0715
raise TypeError(f"Invalid function type {func_name}.")
self.registry.functions_registry[func_name] = func

return _decorator
Expand Down
2 changes: 1 addition & 1 deletion mindmeld/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ def num_parser(ctx, start, port):
DUCKLING_VERSION,
os.path.basename(exec_path),
]
url = os.path.join(*url_components)
url = "/".join(url_components)
logger.info(
"Could not find %s binary file, downloading from %s", exec_path, url
)
Expand Down
9 changes: 9 additions & 0 deletions mindmeld/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@
# Base URL under which downloadable binary artifacts are hosted, and the
# Duckling build identifier used when composing Duckling download URLs.
BINARIES_URL = "https://binaries.mindmeld.com"
DUCKLING_VERSION = "20211005"

# GloVe embeddings archive: version tag, archive file name, and the full
# download URL laid out as <BINARIES_URL>/glove/<version>/<file>.
EMBEDDINGS_VERSION = "6B"
EMBEDDINGS_FILE = f"glove.{EMBEDDINGS_VERSION}.zip"
EMBEDDINGS_URL = f"{BINARIES_URL}/glove/{EMBEDDINGS_VERSION}/{EMBEDDINGS_FILE}"


# ACTIVE LEARNING CONSTANTS
class TuneLevel(Enum):
Expand Down
29 changes: 8 additions & 21 deletions mindmeld/models/containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@
from tqdm import tqdm

from ._util import _is_module_available, _get_module_or_attr as _getattr
from ..constants import EMBEDDINGS_URL
from ..core import Bunch
from ..exceptions import EmbeddingDownloadError
from ..path import (
EMBEDDINGS_FILE_PATH,
EMBEDDINGS_FOLDER_PATH,
)
from ..path import EMBEDDINGS_FILE_PATH, EMBEDDINGS_FOLDER_PATH
from ..resource_loader import Hasher

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -61,7 +59,6 @@ class GloVeEmbeddingsContainer:
"""
CONTAINER_LOOKUP = {}

GLOVE_DOWNLOAD_LINK = "http://nlp.stanford.edu/data/glove.6B.zip"
EMBEDDING_FILE_PATH_TEMPLATE = "glove.6B.{}d.txt"
ALLOWED_WORD_EMBEDDING_DIMENSIONS = [50, 100, 200, 300]

Expand Down Expand Up @@ -96,34 +93,24 @@ def get_pretrained_word_to_embeddings_dict(self):

def _download_embeddings_and_return_zip_handle(self):

logger.info("Downloading embedding from %s", GloVeEmbeddingsContainer.GLOVE_DOWNLOAD_LINK)
logger.info("Downloading embedding from %s", EMBEDDINGS_URL)

# Make the folder that will contain the embeddings
if not os.path.exists(EMBEDDINGS_FOLDER_PATH):
os.makedirs(EMBEDDINGS_FOLDER_PATH)

with TqdmUpTo(
unit="B", unit_scale=True, miniters=1, desc=GloVeEmbeddingsContainer.GLOVE_DOWNLOAD_LINK
) as t:
with TqdmUpTo(unit="B", unit_scale=True, miniters=1, desc=EMBEDDINGS_URL) as t:

try:
urlretrieve(
GloVeEmbeddingsContainer.GLOVE_DOWNLOAD_LINK, EMBEDDINGS_FILE_PATH,
reporthook=t.update_to
)
urlretrieve(EMBEDDINGS_URL, EMBEDDINGS_FILE_PATH, reporthook=t.update_to)

except ConnectionError as e:
logger.error(
"There was an issue downloading from this "
"link %s with the following error: "
"%s",
GloVeEmbeddingsContainer.GLOVE_DOWNLOAD_LINK,
e,
)
logger.error("Error downloading from %s: %s", EMBEDDINGS_URL, e)
return

file_name = GloVeEmbeddingsContainer.EMBEDDING_FILE_PATH_TEMPLATE.format(
self.token_dimension)
self.token_dimension
)
zip_file_object = zipfile.ZipFile(EMBEDDINGS_FILE_PATH, "r")

if file_name not in zip_file_object.namelist():
Expand Down
1 change: 0 additions & 1 deletion mindmeld/models/taggers/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

logger = logging.getLogger(__name__)

GLOVE_DOWNLOAD_LINK = "http://nlp.stanford.edu/data/glove.6B.zip"
EMBEDDING_FILE_PATH_TEMPLATE = "glove.6B.{}d.txt"
ALLOWED_WORD_EMBEDDING_DIMENSIONS = [50, 100, 200, 300]

Expand Down
3 changes: 2 additions & 1 deletion mindmeld/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from functools import wraps
from importlib.machinery import SourceFileLoader

from .constants import EMBEDDINGS_FILE
from .exceptions import MindMeldImportError

MINDMELD_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand Down Expand Up @@ -129,7 +130,7 @@
}

EMBEDDINGS_FOLDER_PATH = os.path.join(MINDMELD_ROOT, "data")
EMBEDDINGS_FILE_PATH = os.path.join(EMBEDDINGS_FOLDER_PATH, "glove.6B.zip")
EMBEDDINGS_FILE_PATH = os.path.join(EMBEDDINGS_FOLDER_PATH, EMBEDDINGS_FILE)
PREVIOUSLY_USED_CHAR_EMBEDDINGS_FILE_PATH = os.path.join(
EMBEDDINGS_FOLDER_PATH, "previously_used_char_embeddings.pkl"
)
Expand Down

0 comments on commit 60f274e

Please sign in to comment.