From 9a46c68b40b38ec28f3c87a21e520edd665fbe03 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 7 Jul 2024 20:55:18 +0200 Subject: [PATCH 01/13] make gensim optional --- flair/class_utils.py | 14 +++++++++++++- flair/embeddings/token.py | 17 ++++++++++++----- requirements.txt | 2 -- setup.py | 3 +++ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/flair/class_utils.py b/flair/class_utils.py index 842a53387d..85d3a60d82 100644 --- a/flair/class_utils.py +++ b/flair/class_utils.py @@ -1,5 +1,6 @@ +import importlib import inspect -from typing import Iterable, Optional, Type, TypeVar +from typing import Iterable, Optional, Type, TypeVar, Any, List T = TypeVar("T") @@ -17,3 +18,14 @@ def get_state_subclass_by_name(cls: Type[T], cls_name: Optional[str]) -> Type[T] if sub_cls.__name__ == cls_name: return sub_cls raise ValueError(f"Could not find any class with name '{cls_name}'") + + +def lazy_import(group: str, module: str, *symbols: List[str]) -> List[Any]: + try: + imported_module = importlib.import_module(module) + except ImportError: + raise ImportError(f"Could not import {module}. Please install the optional '{group}' dependency. Via 'pip install flair[{group}]'") + if not symbols: + return imported_module + + return [getattr(imported_module, symbol) for symbol in symbols] diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 76173bac80..a769849e49 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -7,16 +7,14 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -import gensim import numpy as np import torch from bpemb import BPEmb from deprecated.sphinx import deprecated -from gensim.models import KeyedVectors -from gensim.models.fasttext import FastTextKeyedVectors, load_facebook_vectors from torch import nn import flair +from flair.class_utils import lazy_import from flair.data import Corpus, Dictionary, Sentence, _iter_dataset from flair.embeddings.base import TokenEmbeddings, load_embeddings, register_embeddings from flair.embeddings.transformer import ( @@ -165,6 +163,9 @@ def __init__( Constructor downloads required files if not there. + Note: + When loading a new embedding, you need to have `flair[gensim]` installed. + Args: embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or a path to a custom embedding field: if given, the word-embeddings embed the data for the specific label-type instead of the plain text. @@ -195,12 +196,13 @@ def __init__( super().__init__() if embeddings_path is not None: + KeyedVectors = lazy_import("gensim", "gensim.models", "KeyedVectors") if embeddings_path.suffix in [".bin", ".txt"]: - precomputed_word_embeddings = gensim.models.KeyedVectors.load_word2vec_format( + precomputed_word_embeddings = KeyedVectors.load_word2vec_format( str(embeddings_path), binary=embeddings_path.suffix == ".bin", no_header=no_header ) else: - precomputed_word_embeddings = gensim.models.KeyedVectors.load(str(embeddings_path)) + precomputed_word_embeddings = KeyedVectors.load(str(embeddings_path)) self.__embedding_length: int = precomputed_word_embeddings.vector_size @@ -398,6 +400,8 @@ def __setstate__(self, state: Dict[str, Any]): state.setdefault("fine_tune", False) state.setdefault("field", None) if "precomputed_word_embeddings" in state: + KeyedVectors = lazy_import("gensim", "gensim.models", "KeyedVectors") + precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings") vectors = np.vstack( ( @@ -1015,6 +1019,7 @@ def to_params(self): @register_embeddings +@deprecated(reason="The FastTextEmbeddings are no longer supported and will be removed at version 0.16.0", version="0.14.0") class FastTextEmbeddings(TokenEmbeddings): """FastText Embeddings with oov functionality.""" @@ -1048,6 +1053,8 @@ def __init__( self.static_embeddings = True + FastTextKeyedVectors, load_facebook_vectors = lazy_import("gensim", "gensim.models.fasttext", "FastTextKeyedVectors", "load_facebook_vectors") + if embeddings_path.suffix == ".bin": self.precomputed_word_embeddings: FastTextKeyedVectors = load_facebook_vectors(str(embeddings_path)) else: diff --git a/requirements.txt b/requirements.txt index fdb507e44b..08544189f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ conllu>=4.0 deprecated>=1.2.13 ftfy>=6.1.0 gdown>=4.4.0 -gensim>=4.2.0 huggingface-hub>=0.10.0 langdetect>=1.0.9 lxml>=4.8.0 @@ -23,7 +22,6 @@ torch>=1.5.0,!=1.8 tqdm>=4.63.0 transformer-smaller-training-vocab>=0.2.3 transformers[sentencepiece]>=4.18.0,<5.0.0 -urllib3<2.0.0,>=1.0.0 # pin below 2 to make dependency resolution faster. wikipedia-api>=0.5.7 semver<4.0.0,>=3.0.0 bioc<3.0.0,>=2.0.0 diff --git a/setup.py b/setup.py index 172ab7758c..4b19540e42 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,9 @@ packages=find_packages(exclude=["tests", "tests.*"]), # same as name license="MIT", install_requires=required, + extras_require={ + "gensim": ["gensim>=4.2.0"], + }, include_package_data=True, python_requires=">=3.8", ) From 5f08fbabeaff2f83f48745587a7e2cb1de45c28c Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 7 Jul 2024 21:16:09 +0200 Subject: [PATCH 02/13] fix imports --- flair/embeddings/token.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index a769849e49..5457d2de8b 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -196,7 +196,7 @@ def __init__( super().__init__() if embeddings_path is not None: - KeyedVectors = lazy_import("gensim", "gensim.models", "KeyedVectors") + KeyedVectors, = lazy_import("gensim", "gensim.models", "KeyedVectors") if embeddings_path.suffix in [".bin", ".txt"]: precomputed_word_embeddings = KeyedVectors.load_word2vec_format( str(embeddings_path), binary=embeddings_path.suffix == ".bin", no_header=no_header @@ -400,7 +400,7 @@ def __setstate__(self, state: Dict[str, Any]): state.setdefault("fine_tune", False) state.setdefault("field", None) if "precomputed_word_embeddings" in state: - KeyedVectors = lazy_import("gensim", "gensim.models", "KeyedVectors") + KeyedVectors, = lazy_import("gensim", "gensim.models", "KeyedVectors") precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings") vectors = np.vstack( From 45c32a7ca3ce8fd220cc521ce73aeff08678ca75 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 7 Jul 2024 21:52:37 +0200 Subject: [PATCH 03/13] formatting --- flair/class_utils.py | 6 ++++-- flair/embeddings/token.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/flair/class_utils.py b/flair/class_utils.py index 85d3a60d82..195838754d 100644 --- a/flair/class_utils.py +++ b/flair/class_utils.py @@ -1,6 +1,6 @@ import importlib import inspect -from typing import Iterable, Optional, Type, TypeVar, Any, List +from typing import Any, Iterable, List, Optional, Type, TypeVar T = TypeVar("T") @@ -24,7 +24,9 @@ def lazy_import(group: str, module: str, *symbols: List[str]) -> List[Any]: try: imported_module = importlib.import_module(module) except ImportError: - raise ImportError(f"Could not import {module}. Please install the optional '{group}' dependency. Via 'pip install flair[{group}]'") + raise ImportError( + f"Could not import {module}. Please install the optional '{group}' dependency. Via 'pip install flair[{group}]'" + ) if not symbols: return imported_module diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 5457d2de8b..a0f2130609 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -196,7 +196,7 @@ def __init__( super().__init__() if embeddings_path is not None: - KeyedVectors, = lazy_import("gensim", "gensim.models", "KeyedVectors") + (KeyedVectors,) = lazy_import("gensim", "gensim.models", "KeyedVectors") if embeddings_path.suffix in [".bin", ".txt"]: precomputed_word_embeddings = KeyedVectors.load_word2vec_format( str(embeddings_path), binary=embeddings_path.suffix == ".bin", no_header=no_header @@ -400,7 +400,7 @@ def __setstate__(self, state: Dict[str, Any]): state.setdefault("fine_tune", False) state.setdefault("field", None) if "precomputed_word_embeddings" in state: - KeyedVectors, = lazy_import("gensim", "gensim.models", "KeyedVectors") + (KeyedVectors,) = lazy_import("gensim", "gensim.models", "KeyedVectors") precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings") vectors = np.vstack( @@ -1019,7 +1019,9 @@ def to_params(self): @register_embeddings -@deprecated(reason="The FastTextEmbeddings are no longer supported and will be removed at version 0.16.0", version="0.14.0") +@deprecated( + reason="The FastTextEmbeddings are no longer supported and will be removed at version 0.16.0", version="0.14.0" +) class FastTextEmbeddings(TokenEmbeddings): """FastText Embeddings with oov functionality.""" @@ -1053,7 +1055,9 @@ def __init__( self.static_embeddings = True - FastTextKeyedVectors, load_facebook_vectors = lazy_import("gensim", "gensim.models.fasttext", "FastTextKeyedVectors", "load_facebook_vectors") + FastTextKeyedVectors, load_facebook_vectors = lazy_import( + "gensim", "gensim.models.fasttext", "FastTextKeyedVectors", "load_facebook_vectors" + ) if embeddings_path.suffix == ".bin": self.precomputed_word_embeddings: FastTextKeyedVectors = load_facebook_vectors(str(embeddings_path)) From f0c68af19022a200431dbd222a4893e5ff224d94 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 15:03:07 +0200 Subject: [PATCH 04/13] make bpeemb optional --- flair/embeddings/token.py | 196 ++++++++++++++++++++------------------ requirements.txt | 1 - setup.py | 2 +- 3 files changed, 105 insertions(+), 94 deletions(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index a0f2130609..359ded449b 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -1,6 +1,5 @@ import hashlib import logging -import os import re import tempfile from collections import Counter @@ -9,8 +8,8 @@ import numpy as np import torch -from bpemb import BPEmb from deprecated.sphinx import deprecated +from sentencepiece import SentencePieceProcessor from torch import nn import flair @@ -196,7 +195,7 @@ def __init__( super().__init__() if embeddings_path is not None: - (KeyedVectors,) = lazy_import("gensim", "gensim.models", "KeyedVectors") + (KeyedVectors,) = lazy_import("word-embeddings", "gensim.models", "KeyedVectors") if embeddings_path.suffix in [".bin", ".txt"]: precomputed_word_embeddings = KeyedVectors.load_word2vec_format( str(embeddings_path), binary=embeddings_path.suffix == ".bin", no_header=no_header @@ -220,7 +219,7 @@ def __init__( # gensim version 3 self.vocab = {k: v.index for k, v in precomputed_word_embeddings.vocab.items()} else: - # if no embedding is set, the vocab and embedding length is requried + # if no embedding is set, the vocab and embedding length is required assert vocab is not None assert embedding_length is not None self.vocab = vocab @@ -335,12 +334,6 @@ def get_cached_token_index(self, word: str) -> int: else: return len(self.vocab) # token - def get_vec(self, word: str) -> torch.Tensor: - word_embedding = self.vectors[self.get_cached_token_index(word)] - - word_embedding = torch.tensor(word_embedding.tolist(), device=flair.device, dtype=torch.float) - return word_embedding - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: tokens = [token for sentence in sentences for token in sentence.tokens] @@ -400,7 +393,7 @@ def __setstate__(self, state: Dict[str, Any]): state.setdefault("fine_tune", False) state.setdefault("field", None) if "precomputed_word_embeddings" in state: - (KeyedVectors,) = lazy_import("gensim", "gensim.models", "KeyedVectors") + (KeyedVectors,) = lazy_import("word-embeddings", "gensim.models", "KeyedVectors") precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings") vectors = np.vstack( @@ -1056,7 +1049,7 @@ def __init__( self.static_embeddings = True FastTextKeyedVectors, load_facebook_vectors = lazy_import( - "gensim", "gensim.models.fasttext", "FastTextKeyedVectors", "load_facebook_vectors" + "word-embeddings", "gensim.models.fasttext", "FastTextKeyedVectors", "load_facebook_vectors" ) if embeddings_path.suffix == ".bin": @@ -1376,47 +1369,6 @@ def to_params(self): return {} -# TODO: keep for backwards compatibility, but remove in future -@deprecated( - reason="""'BPEmbSerializable' is only used in the legacy pickle-embeddings format. - Please save your model again to save it in the serializable json format. - """, - version="0.13.0", -) -class BPEmbSerializable(BPEmb): - """Helper class to allow pickle-seralizable BPE embeddings.""" - - def __getstate__(self): - state = self.__dict__.copy() - # save the sentence piece model as binary file (not as path which may change) - with self.model_file.open(mode="rb") as fin: - state["spm_model_binary"] = fin.read() - state["spm"] = None - return state - - def __setstate__(self, state): - from bpemb.util import sentencepiece_load - - model_file = self.model_tpl.format(lang=state["lang"], vs=state["vs"]) - self.__dict__ = state - - # write out the binary sentence piece model into the expected directory - self.cache_dir: Path = flair.cache_root / "embeddings" - if "spm_model_binary" in self.__dict__: - # if the model was saved as binary and it is not found on disk, write to appropriate path - if not os.path.exists(self.cache_dir / state["lang"]): - os.makedirs(self.cache_dir / state["lang"]) - self.model_file = self.cache_dir / model_file - with open(self.model_file, "wb") as out: - out.write(self.__dict__["spm_model_binary"]) - else: - # otherwise, use normal process and potentially trigger another download - self.model_file = self._load_file(model_file) - - # once the modes if there, load it with sentence piece - state["spm"] = sentencepiece_load(self.model_file) - - @register_embeddings class BytePairEmbeddings(TokenEmbeddings): def __init__( @@ -1428,6 +1380,7 @@ def __init__( model_file_path: Optional[Path] = None, embedding_file_path: Optional[Path] = None, name: Optional[str] = None, + force_cpu: bool = True, **kwargs, ) -> None: """Initializes BP embeddings. @@ -1438,51 +1391,98 @@ def __init__( if not cache_dir: cache_dir = flair.cache_root / "embeddings" - if language: - self.name: str = f"bpe-{language}-{syllables}-{dim}" + + if model_file_path is None and embedding_file_path is not None: + self.spm = SentencePieceProcessor() + self.spm.Load(str(model_file_path)) + vectors = np.zeros((self.spm.vocab_size() + 1, dim)) + self.name = name else: - assert ( - model_file_path is not None and embedding_file_path is not None - ), "Need to specify model_file_path and embedding_file_path if no language is given in BytePairEmbeddings(...)" - dim = None # type: ignore[assignment] - - self.embedder = BPEmb( - lang=language, - vs=syllables, - dim=dim, - cache_dir=cache_dir, - model_file=model_file_path, - emb_file=embedding_file_path, - **kwargs, - ) + if not language and model_file_path is None: + raise ValueError("Need to specify model_file_path if no language is give in BytePairEmbeddings") + BPEmb, = lazy_import("word-embeddings", "bpemb", "BPEmb") + + if language: + self.name: str = f"bpe-{language}-{syllables}-{dim}" + embedder = BPEmb( + lang=language, + vs=syllables, + dim=dim, + cache_dir=cache_dir, + model_file=model_file_path, + emb_file=embedding_file_path, + **kwargs, + ) + vectors = np.vstack( + ( + embedder.vectors, + np.zeros(embedder.dim, dtype=embedder.vectors.dtype), + ) + ) + else: + if model_file_path is None: + raise ValueError("Need to specify model_file_path if no language is give in BytePairEmbeddings") + embedder = BPEmb( + lang=language, + vs=syllables, + dim=dim, + cache_dir=cache_dir, + model_file=model_file_path, + emb_file=embedding_file_path, + **kwargs, + ) + self.spm = embedder.spm + vectors = np.vstack( + ( + embedder.vectors, + np.zeros(embedder.dim, dtype=embedder.vectors.dtype), + ) + ) + dim = embedder.dim + syllables = embedder.vs - if not language: - self.name = f"bpe-custom-{self.embedder.vs}-{self.embedder.dim}" + if not language: + self.name = f"bpe-custom-{syllables}-{dim}" if name is not None: self.name = name + self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze=True) + self.force_cpu = force_cpu self.static_embeddings = True - self.__embedding_length: int = self.embedder.emb.vector_size * 2 + self.__embedding_length: int = self.dim * 2 super().__init__() self.eval() + def _preprocess(self, text: str) -> str: + return re.sub(r"\d", "0", text) + @property def embedding_length(self) -> int: return self.__embedding_length def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - for _i, sentence in enumerate(sentences): - for token, _token_idx in zip(sentence.tokens, range(len(sentence.tokens))): - word = token.text + tokens = [token for sentence in sentences for token in sentence.tokens] - if word.strip() == "": - # empty words get no embedding - token.set_embedding(self.name, torch.zeros(self.embedding_length, dtype=torch.float)) - else: - # all other words get embedded - embeddings = self.embedder.embed(word.lower()) - embedding = np.concatenate((embeddings[0], embeddings[len(embeddings) - 1])) - token.set_embedding(self.name, torch.tensor(embedding, dtype=torch.float)) + word_indices: List[int] = [] + for token in tokens: + word = token.text if self.field is None else token.get_label(self.field).value + + if word.strip() == "": + ids = [self.embedder.spm.vocab_size(), self.embedder.spm.vocab_size()] + else: + if self.do_preproc: + word = self._preprocess(word) + ids = self.embedder.spm.EncodeAsIds(word.lower()) + ids = torch.tensor([ids[0], ids[-1]], dtype=torch.long, device=self.device) + word_indices.append(ids) + + embeddings = self.embedding(torch.tensor(word_indices, dtype=torch.long, device=self.device)) + + if self.force_cpu: + embeddings = embeddings.to(flair.device) + + for emb, token in zip(embeddings, tokens): + token.set_embedding(self.name, emb) return sentences @@ -1498,21 +1498,33 @@ def from_params(cls, params): temp_path = Path(temp_dir) model_file_path = temp_path / "model.spm" model_file_path.write_bytes(params["spm_model_binary"]) - embedding_file_path = temp_path / "word2vec.bin" - embedding_file_path.write_bytes(params["word2vec_binary"]) - return cls(name=params["name"], model_file_path=model_file_path, embedding_file_path=embedding_file_path) - def to_params(self): - if not self.embedder.emb_file.exists(): - self.embedder.emb_file = self.embedder.emb_file.with_suffix(".bin") - self.embedder.emb.save_word2vec_format(str(self.embedder.emb_file), binary=True) + if "word2vec_binary" in params: + embedding_file_path = temp_path / "word2vec.bin" + embedding_file_path.write_bytes(params["word2vec_binary"]) + dim = None + else: + embedding_file_path = None + dim = params["dim"] + return cls(name=params["name"], dim=dim, model_file_path=model_file_path, embedding_file_path=embedding_file_path) + def to_params(self): return { "name": self.name, - "spm_model_binary": self.embedder.spm.serialized_model_proto(), - "word2vec_binary": self.embedder.emb_file.read_bytes(), + "spm_model_binary": self.spm.serialized_model_proto(), + "dim": self.embedding_length // 2, } + def _apply(self, fn): + if fn.__name__ == "convert" and self.force_cpu: + # this is required to force the module on the cpu, + # if a parent module is put to gpu, the _apply is called to each sub_module + # self.to(..) actually sets the device properly + if not hasattr(self, "device"): + self.to(flair.device) + return + super()._apply(fn) + @register_embeddings class NILCEmbeddings(WordEmbeddings): diff --git a/requirements.txt b/requirements.txt index 08544189f6..2feab121c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ boto3>=1.20.27 -bpemb>=0.3.5 conllu>=4.0 deprecated>=1.2.13 ftfy>=6.1.0 diff --git a/setup.py b/setup.py index 4b19540e42..17f60733ff 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ license="MIT", install_requires=required, extras_require={ - "gensim": ["gensim>=4.2.0"], + "word-embeddings": ["gensim>=4.2.0", "bpemb>=0.3.5"], }, include_package_data=True, python_requires=">=3.8", From 1ed769003d75af2ed406795aec63c1aef07652ee Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 16:36:24 +0200 Subject: [PATCH 05/13] make bpeemb optional --- flair/embeddings/__init__.py | 1 - flair/embeddings/token.py | 82 ++++++++++++++++++++++++------------ 2 files changed, 55 insertions(+), 28 deletions(-) diff --git a/flair/embeddings/__init__.py b/flair/embeddings/__init__.py index 04e1d1376b..308acfceb3 100644 --- a/flair/embeddings/__init__.py +++ b/flair/embeddings/__init__.py @@ -40,7 +40,6 @@ # Expose token embedding classes from .token import ( - BPEmbSerializable, BytePairEmbeddings, CharacterEmbeddings, FastTextEmbeddings, diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 359ded449b..6156258f18 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -2,7 +2,7 @@ import logging import re import tempfile -from collections import Counter +from collections import Counter, Mapping from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -1381,6 +1381,8 @@ def __init__( embedding_file_path: Optional[Path] = None, name: Optional[str] = None, force_cpu: bool = True, + field: Optional[str] = None, + preprocess: bool = True, **kwargs, ) -> None: """Initializes BP embeddings. @@ -1388,11 +1390,10 @@ def __init__( Constructor downloads required files if not there. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) - if not cache_dir: cache_dir = flair.cache_root / "embeddings" - if model_file_path is None and embedding_file_path is not None: + if model_file_path is not None and embedding_file_path is None: self.spm = SentencePieceProcessor() self.spm.Load(str(model_file_path)) vectors = np.zeros((self.spm.vocab_size() + 1, dim)) @@ -1413,12 +1414,6 @@ def __init__( emb_file=embedding_file_path, **kwargs, ) - vectors = np.vstack( - ( - embedder.vectors, - np.zeros(embedder.dim, dtype=embedder.vectors.dtype), - ) - ) else: if model_file_path is None: raise ValueError("Need to specify model_file_path if no language is give in BytePairEmbeddings") @@ -1431,26 +1426,28 @@ def __init__( emb_file=embedding_file_path, **kwargs, ) - self.spm = embedder.spm - vectors = np.vstack( - ( - embedder.vectors, - np.zeros(embedder.dim, dtype=embedder.vectors.dtype), - ) + self.spm = embedder.spm + vectors = np.vstack( + ( + embedder.vectors, + np.zeros(embedder.dim, dtype=embedder.vectors.dtype), ) - dim = embedder.dim - syllables = embedder.vs + ) + dim = embedder.dim + syllables = embedder.vs - if not language: - self.name = f"bpe-custom-{syllables}-{dim}" + if not language: + self.name = f"bpe-custom-{syllables}-{dim}" if name is not None: self.name = name + super().__init__() self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze=True) self.force_cpu = force_cpu self.static_embeddings = True + self.field = field + self.do_preproc = preprocess - self.__embedding_length: int = self.dim * 2 - super().__init__() + self.__embedding_length: int = dim * 2 self.eval() def _preprocess(self, text: str) -> str: @@ -1468,16 +1465,18 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: word = token.text if self.field is None else token.get_label(self.field).value if word.strip() == "": - ids = [self.embedder.spm.vocab_size(), self.embedder.spm.vocab_size()] + ids = [self.spm.vocab_size(), self.embedder.spm.vocab_size()] else: if self.do_preproc: word = self._preprocess(word) - ids = self.embedder.spm.EncodeAsIds(word.lower()) - ids = torch.tensor([ids[0], ids[-1]], dtype=torch.long, device=self.device) + ids = self.spm.EncodeAsIds(word.lower()) + ids = [ids[0], ids[-1]] word_indices.append(ids) - embeddings = self.embedding(torch.tensor(word_indices, dtype=torch.long, device=self.device)) - + breakpoint() + index_tensor = torch.tensor(word_indices, dtype=torch.long, device=self.device) + embeddings = self.embedding(index_tensor) + embeddings = embeddings.reshape((-1, self.embedding_length)) if self.force_cpu: embeddings = embeddings.to(flair.device) @@ -1506,15 +1505,23 @@ def from_params(cls, params): else: embedding_file_path = None dim = params["dim"] - return cls(name=params["name"], dim=dim, model_file_path=model_file_path, embedding_file_path=embedding_file_path) + return cls(name=params["name"], dim=dim, model_file_path=model_file_path, embedding_file_path=embedding_file_path, field=params.get("field"), preprocess=params.get("preprocess", True)) def to_params(self): return { "name": self.name, "spm_model_binary": self.spm.serialized_model_proto(), "dim": self.embedding_length // 2, + "field": self.field, + "preprocess": self.preprocess, } + def to(self, device): + if self.force_cpu: + device = torch.device("cpu") + self.device = device + super().to(device) + def _apply(self, fn): if fn.__name__ == "convert" and self.force_cpu: # this is required to force the module on the cpu, @@ -1525,6 +1532,27 @@ def _apply(self, fn): return super()._apply(fn) + def state_dict(self, *args, **kwargs): + # when loading the old versions from pickle, the embeddings might not be added as pytorch module. + # we do this delayed, when the weights are collected (e.g. for saving), as doing this earlier might + # lead to issues while loading (trying to load weights that weren't stored as python weights and therefore + # not finding them) + if list(self.modules()) == [self]: + self.embedding = self.embedding + return super().state_dict(*args, **kwargs) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + if not state_dict: + # old embeddings do not have a torch-embedding and therefore do not store the weights in the saved torch state_dict + # however they are already initialized rightfully, so we just set the state dict from our current state dict + for k, v in self.state_dict(prefix=prefix).items(): + state_dict[k] = v + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + @register_embeddings class NILCEmbeddings(WordEmbeddings): From c17dd2f8cfe65a63fd706752e53995c3c35e0c42 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 16:40:42 +0200 Subject: [PATCH 06/13] make bpeemb optional --- flair/embeddings/token.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 6156258f18..18abeccb28 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -1473,7 +1473,6 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: ids = [ids[0], ids[-1]] word_indices.append(ids) - breakpoint() index_tensor = torch.tensor(word_indices, dtype=torch.long, device=self.device) embeddings = self.embedding(index_tensor) embeddings = embeddings.reshape((-1, self.embedding_length)) From fb6004703be831aface1ee251b1b7b997a9b4f39 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 16:46:23 +0200 Subject: [PATCH 07/13] make muse warn immediately if gensim is not installed --- flair/embeddings/token.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 18abeccb28..b139f97ff3 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -2,7 +2,7 @@ import logging import re import tempfile -from collections import Counter, Mapping +from collections import Counter from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -1283,6 +1283,8 @@ def __init__( self.static_embeddings = True self.__embedding_length: int = 300 self.language_embeddings: Dict[str, Any] = {} + (KeyedVectors,) = lazy_import("word-embeddings", "gensim.models", "KeyedVectors") + self.kv = KeyedVectors super().__init__() self.eval() @@ -1345,7 +1347,7 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: embeddings_file = cached_path(f"{hu_path}/muse.{language_code}.vec.gensim", cache_dir=cache_dir) # load the model - self.language_embeddings[language_code] = gensim.models.KeyedVectors.load(str(embeddings_file)) + self.language_embeddings[language_code] = self.kv.load(str(embeddings_file)) for token, _token_idx in zip(sentence.tokens, range(len(sentence.tokens))): word_embedding = self.get_cached_vec(language_code=language_code, word=token.text) @@ -1401,7 +1403,7 @@ def __init__( else: if not language and model_file_path is None: raise ValueError("Need to specify model_file_path if no language is give in BytePairEmbeddings") - BPEmb, = lazy_import("word-embeddings", "bpemb", "BPEmb") + (BPEmb,) = lazy_import("word-embeddings", "bpemb", "BPEmb") if language: self.name: str = f"bpe-{language}-{syllables}-{dim}" @@ -1504,7 +1506,14 @@ def from_params(cls, params): else: embedding_file_path = None dim = params["dim"] - return cls(name=params["name"], dim=dim, model_file_path=model_file_path, embedding_file_path=embedding_file_path, field=params.get("field"), preprocess=params.get("preprocess", True)) + return cls( + name=params["name"], + dim=dim, + model_file_path=model_file_path, + embedding_file_path=embedding_file_path, + field=params.get("field"), + preprocess=params.get("preprocess", True), + ) def to_params(self): return { @@ -1541,7 +1550,7 @@ def state_dict(self, *args, **kwargs): return super().state_dict(*args, **kwargs) def _load_from_state_dict( - self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): if not state_dict: # old embeddings do not have a torch-embedding and therefore do not store the weights in the saved torch state_dict From c72615e42c561e33e19f7d64669ea165685798b5 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 16:55:18 +0200 Subject: [PATCH 08/13] install word-embeddings when running tests --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ee90c6421..c801b96a70 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: - name: Install Torch cpu run: pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Install Flair dependencies - run: pip install -e . + run: pip install -e .[word-embeddings] - name: Install unittest dependencies run: pip install -r requirements-dev.txt - name: Show installed dependencies From ad78a7fe40bc5bc59221425d3fc2394653ed526a Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 17:21:15 +0200 Subject: [PATCH 09/13] pin conllu to 4.*.* --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2feab121c8..7f159bcd14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ boto3>=1.20.27 -conllu>=4.0 +conllu>=4.0,<5.0.0 deprecated>=1.2.13 ftfy>=6.1.0 gdown>=4.4.0 From 08117418dc49baaed97534e53f64c0533ddcfa5c Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 12 Jul 2024 20:23:41 +0200 Subject: [PATCH 10/13] put BytePairEmbeddings on right device from start --- flair/embeddings/token.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index b139f97ff3..eac263ca02 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -1451,6 +1451,7 @@ def __init__( self.__embedding_length: int = dim * 2 self.eval() + self.to(flair.device) def _preprocess(self, text: str) -> str: return re.sub(r"\d", "0", text) From 6464b4fa830372b81f51b063ce5bdf3ec676babb Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sat, 13 Jul 2024 00:14:30 +0200 Subject: [PATCH 11/13] fix saving from bytepairembeddings --- flair/embeddings/token.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index eac263ca02..644968a972 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -1522,7 +1522,7 @@ def to_params(self): "spm_model_binary": self.spm.serialized_model_proto(), "dim": self.embedding_length // 2, "field": self.field, - "preprocess": self.preprocess, + "preprocess": self.do_preproc, } def to(self, device): From e737515314b3f7144f030c9e56eda7398e75a770 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sat, 13 Jul 2024 17:18:55 +0200 Subject: [PATCH 12/13] fix typing errors --- flair/class_utils.py | 18 +++++++++++++++--- flair/embeddings/token.py | 15 ++++++++------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/flair/class_utils.py b/flair/class_utils.py index 195838754d..c17242fd3d 100644 --- a/flair/class_utils.py +++ b/flair/class_utils.py @@ -1,6 +1,7 @@ import importlib import inspect -from typing import Any, Iterable, List, Optional, Type, TypeVar +from types import ModuleType +from typing import Any, Iterable, List, Optional, Type, TypeVar, Union, overload T = TypeVar("T") @@ -20,14 +21,25 @@ def get_state_subclass_by_name(cls: Type[T], cls_name: Optional[str]) -> Type[T] raise ValueError(f"Could not find any class with name '{cls_name}'") -def lazy_import(group: str, module: str, *symbols: List[str]) -> List[Any]: +@overload +def lazy_import(group: str, module: str, first_symbol: None) -> ModuleType: + ... + + +@overload +def lazy_import(group: str, module: str, first_symbol: str, *symbols: str) -> List[Any]: + ... + + +def lazy_import(group: str, module: str, first_symbol: Optional[str] = None, *symbols: str) -> Union[List[Any], ModuleType]: try: imported_module = importlib.import_module(module) except ImportError: raise ImportError( f"Could not import {module}. Please install the optional '{group}' dependency. Via 'pip install flair[{group}]'" ) - if not symbols: + if first_symbol is None: return imported_module + symbols = (first_symbol, *symbols) return [getattr(imported_module, symbol) for symbol in symbols] diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 644968a972..ad545cefff 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -4,7 +4,7 @@ import tempfile from collections import Counter from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Tuple import numpy as np import torch @@ -393,9 +393,7 @@ def __setstate__(self, state: Dict[str, Any]): state.setdefault("fine_tune", False) state.setdefault("field", None) if "precomputed_word_embeddings" in state: - (KeyedVectors,) = lazy_import("word-embeddings", "gensim.models", "KeyedVectors") - - precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings") + precomputed_word_embeddings = state.pop("precomputed_word_embeddings") vectors = np.vstack( ( precomputed_word_embeddings.vectors, @@ -1053,7 +1051,7 @@ def __init__( ) if embeddings_path.suffix == ".bin": - self.precomputed_word_embeddings: FastTextKeyedVectors = load_facebook_vectors(str(embeddings_path)) + self.precomputed_word_embeddings = load_facebook_vectors(str(embeddings_path)) else: self.precomputed_word_embeddings = FastTextKeyedVectors.load(str(embeddings_path)) @@ -1399,7 +1397,10 @@ def __init__( self.spm = SentencePieceProcessor() self.spm.Load(str(model_file_path)) vectors = np.zeros((self.spm.vocab_size() + 1, dim)) - self.name = name + if name is not None: + self.name = name + else: + raise ValueError("When only providing a SentencePieceProcessor, you need to specify a name for the BytePairEmbeddings") else: if not language and model_file_path is None: raise ValueError("Need to specify model_file_path if no language is give in BytePairEmbeddings") @@ -1463,7 +1464,7 @@ def embedding_length(self) -> int: def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: tokens = [token for sentence in sentences for token in sentence.tokens] - word_indices: List[int] = [] + word_indices: List[List[int]] = [] for token in tokens: word = token.text if self.field is None else token.get_label(self.field).value From a7008ed1f190553eaa11e1d60548c154b57ae66c Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sat, 13 Jul 2024 19:34:58 +0200 Subject: [PATCH 13/13] code formatting --- flair/class_utils.py | 10 +++++----- flair/embeddings/token.py | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/flair/class_utils.py b/flair/class_utils.py index c17242fd3d..9aa95cd1ee 100644 --- a/flair/class_utils.py +++ b/flair/class_utils.py @@ -22,16 +22,16 @@ def get_state_subclass_by_name(cls: Type[T], cls_name: Optional[str]) -> Type[T] @overload -def lazy_import(group: str, module: str, first_symbol: None) -> ModuleType: - ... +def lazy_import(group: str, module: str, first_symbol: None) -> ModuleType: ... @overload -def lazy_import(group: str, module: str, first_symbol: str, *symbols: str) -> List[Any]: - ... +def lazy_import(group: str, module: str, first_symbol: str, *symbols: str) -> List[Any]: ... -def lazy_import(group: str, module: str, first_symbol: Optional[str] = None, *symbols: str) -> Union[List[Any], ModuleType]: +def lazy_import( + group: str, module: str, first_symbol: Optional[str] = None, *symbols: str +) -> Union[List[Any], ModuleType]: try: imported_module = importlib.import_module(module) except ImportError: diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index ad545cefff..868c4175ff 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -4,7 +4,7 @@ import tempfile from collections import Counter from pathlib import Path -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import Any, Dict, List, Optional, Union import numpy as np import torch @@ -1400,7 +1400,9 @@ def __init__( if name is not None: self.name = name else: - raise ValueError("When only providing a SentencePieceProcessor, you need to specify a name for the BytePairEmbeddings") + raise ValueError( + "When only providing a SentencePieceProcessor, you need to specify a name for the BytePairEmbeddings" + ) else: if not language and model_file_path is None: raise ValueError("Need to specify model_file_path if no language is give in BytePairEmbeddings")