Skip to content

Commit

Permalink
Fix upper limit of nb_elements (#2067)
Browse files Browse the repository at this point in the history
* upper limit nb_elements fix

* upper limit nb_elements fix tests
  • Loading branch information
mileswatsonbjss committed Jun 26, 2024
1 parent 97c046c commit b01507a
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 61 deletions.
72 changes: 50 additions & 22 deletions faker/providers/lorem/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,40 +23,33 @@ class Provider(BaseProvider):
word_connector = " "
sentence_punctuation = "."

def words(
def get_words_list(
self,
nb: int = 3,
part_of_speech: Optional[str] = None,
ext_word_list: Optional[Sequence[str]] = None,
unique: bool = False,
) -> List[str]:
"""Generate a tuple of words.
The ``nb`` argument controls the number of words in the resulting list,
and if ``ext_word_list`` is provided, words from that list will be used
instead of those from the locale provider's built-in word list.
"""Get list of words.
If ``unique`` is ``True``, this method will return a list containing
unique words. Under the hood, |random_sample| will be used for sampling
without replacement. If ``unique`` is ``False``, |random_choices| is
used instead, and the list returned may contain duplicates.
``ext_word_list`` is a parameter that allows the user to provide a list
of words to be used instead of the built-in word list. If ``ext_word_list``
is provided, then the value of ``part_of_speech`` is ignored.
``part_of_speech`` is a parameter that defines to what part of speech
the returned word belongs. If ``ext_word_list`` is not ``None``, then
``part_of_speech`` is ignored. If the value of ``part_of_speech`` does
not correspond to an existent part of speech according to the set locale,
then an exception is raised.
.. warning::
Depending on the length of a locale provider's built-in word list or
on the length of ``ext_word_list`` if provided, a large ``nb`` can
exhaust said lists if ``unique`` is ``True``, raising an exception.
:sample: part_of_speech="abc", ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample: part_of_speech="abc"
:sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample:
:sample: nb=5
:sample: nb=5, ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample: nb=4, ext_word_list=['abc', 'def', 'ghi', 'jkl'], unique=True
.. warning::
Depending on the length of a locale provider's built-in word list or
on the length of ``ext_word_list`` if provided, a large ``nb`` can
exhaust said lists if ``unique`` is ``True``, raising an exception.
"""

if ext_word_list is not None:
word_list = ext_word_list
elif part_of_speech:
Expand All @@ -67,6 +60,38 @@ def words(
else:
word_list = self.word_list # type: ignore[attr-defined]

return word_list

Check failure on line 63 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.8)

Incompatible return value type

Check failure on line 63 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.9)

Incompatible return value type

Check failure on line 63 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.10)

Incompatible return value type

def words(
self,
nb: int = 3,
word_list: List[str] = None,

Check failure on line 68 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.8)

Incompatible default for

Check failure on line 68 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.9)

Incompatible default for

Check failure on line 68 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.10)

Incompatible default for
unique: bool = False,
) -> List[str]:
"""Generate a tuple of words.
The ``nb`` argument controls the number of words in the resulting list,
and if ``ext_word_list`` is provided, words from that list will be used
instead of those from the locale provider's built-in word list.
If ``word_list`` is not provided, it defaults to ``None``,
which will result in the method calling the ``get_words_list`` method to get the
word list. If ``word_list`` is provided, the method will use the provided list.
If ``unique`` is ``True``, this method will return a list containing
unique words. Under the hood, |random_sample| will be used for sampling
without replacement. If ``unique`` is ``False``, |random_choices| is
used instead, and the list returned may contain duplicates.
:sample:
:sample: nb=5
:sample: nb=5, ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample: nb=4, ext_word_list=['abc', 'def', 'ghi', 'jkl'], unique=True
"""

if word_list is None:
word_list = self.get_words_list()

if unique:
unique_samples = cast(List[str], self.random_sample(word_list, length=nb))
return unique_samples
Expand All @@ -82,7 +107,9 @@ def word(self, part_of_speech: Optional[str] = None, ext_word_list: Optional[Seq
:sample:
:sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
"""
return self.words(1, part_of_speech, ext_word_list)[0]
word_list = self.get_words_list(part_of_speech, ext_word_list)

return self.words(1, word_list)[0]

def sentence(
self, nb_words: int = 6, variable_nb_words: bool = True, ext_word_list: Optional[Sequence[str]] = None
Expand All @@ -109,7 +136,8 @@ def sentence(
if variable_nb_words:
nb_words = self.randomize_nb_elements(nb_words, min=1)

words = list(self.words(nb=nb_words, ext_word_list=ext_word_list))
word_list = self.get_words_list(ext_word_list=ext_word_list)
words = list(self.words(nb=nb_words, word_list=word_list))
words[0] = words[0].title()

return self.word_connector.join(words) + self.sentence_punctuation
Expand Down
5 changes: 4 additions & 1 deletion faker/providers/lorem/en_PH/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ def english_words(self, nb: int = 3, unique: bool = False) -> List[str]:
:sample: nb=5
:sample: nb=5, unique=True
"""
return self.words(nb=nb, ext_word_list=self.english_word_list, unique=unique)

word_list = self.generator.get_words_list(ext_word_list=self.english_word_list)

return self.words(nb=nb, word_list=word_list, unique=unique)

def english_sentence(self, nb_words: int = 6, variable_nb_words: bool = True) -> str:
"""Generate a sentence in English.
Expand Down
12 changes: 12 additions & 0 deletions faker/providers/python/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import math
import string
import sys
Expand All @@ -16,6 +17,8 @@
TypesSpec = Union[List[Type], Tuple[Type, ...]]
TEnum = TypeVar("TEnum", bound=Enum)

logger = logging.getLogger(__name__)


class EmptyEnumException(BaseFakerException):
pass
Expand Down Expand Up @@ -466,9 +469,18 @@ def pydict(
:variable_nb_elements: whether to use a variable number of elements for the dictionary
:value_types: type of dictionary values
"""

words_list_count = len(self.generator.get_words_list())

if variable_nb_elements:
nb_elements = self.randomize_nb_elements(nb_elements, min=1)

if nb_elements > words_list_count:
logger.warning(
f"Number of nb_elements is greater than the number of words in the list. {words_list_count} words will be used."

Check failure on line 480 in faker/providers/python/__init__.py

View workflow job for this annotation

GitHub Actions / flake8

line too long (128 > 120 characters)
)
nb_elements = words_list_count

return dict(
zip(
self.generator.words(nb_elements, unique=True),
Expand Down
49 changes: 31 additions & 18 deletions faker/proxy.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1588,6 +1588,32 @@ class Faker:
def isbn10(self, separator: str = ...) -> str: ...
def isbn13(self, separator: str = ...) -> str: ...
def job(self) -> str: ...
def get_words_list(
    self, part_of_speech: Optional[str] = ..., ext_word_list: Optional[Sequence[str]] = ...
) -> List[str]:
    """
    Get list of words.
    ``ext_word_list`` is a parameter that allows the user to provide a list
    of words to be used instead of the built-in word list. If ``ext_word_list``
    is provided, then the value of ``part_of_speech`` is ignored.
    ``part_of_speech`` is a parameter that defines to what part of speech
    the returned word belongs. If ``ext_word_list`` is not ``None``, then
    ``part_of_speech`` is ignored. If the value of ``part_of_speech`` does
    not correspond to an existent part of speech according to the set locale,
    then an exception is raised.
    :sample: part_of_speech="abc", ext_word_list=['abc', 'def', 'ghi', 'jkl']
    :sample: part_of_speech="abc"
    :sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
    .. warning::
       The length of the returned list is bounded by the length of the locale
       provider's built-in word list, or of ``ext_word_list`` if provided, so
       callers sampling without replacement from it can exhaust it.
    """
    ...
def paragraph(
self, nb_sentences: int = ..., variable_nb_sentences: bool = ..., ext_word_list: Optional[Sequence[str]] = ...
) -> str:
Expand Down Expand Up @@ -1703,36 +1729,23 @@ class Faker:
:sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
"""
...
def words(
self,
nb: int = ...,
part_of_speech: Optional[str] = ...,
ext_word_list: Optional[Sequence[str]] = ...,
unique: bool = ...,
) -> List[str]:
def words(self, nb: int = ..., word_list: List[str] = ..., unique: bool = ...) -> List[str]:
"""
Generate a list of words.
The ``nb`` argument controls the number of words in the resulting list,
and if ``ext_word_list`` is provided, words from that list will be used
instead of those from the locale provider's built-in word list.
if ``word_list`` is not provided, the method will use a default value of None,
which will result in the method calling the ``get_words_list`` method to get the
word list. If ``word_list`` is provided, the method will use the provided list.
If ``unique`` is ``True``, this method will return a list containing
unique words. Under the hood, |random_sample| will be used for sampling
without replacement. If ``unique`` is ``False``, |random_choices| is
used instead, and the list returned may contain duplicates.
``part_of_speech`` is a parameter that defines to what part of speech
the returned word belongs. If ``ext_word_list`` is not ``None``, then
``part_of_speech`` is ignored. If the value of ``part_of_speech`` does
not correspond to an existent part of speech according to the set locale,
then an exception is raised.
.. warning::
Depending on the length of a locale provider's built-in word list or
on the length of ``ext_word_list`` if provided, a large ``nb`` can
exhaust said lists if ``unique`` is ``True``, raising an exception.
:sample:
:sample: nb=5
:sample: nb=5, ext_word_list=['abc', 'def', 'ghi', 'jkl']
Expand Down
35 changes: 16 additions & 19 deletions tests/providers/test_lorem.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ def test_words_with_defaults(self, faker, num_samples):
def test_words_with_custom_word_list(self, faker, num_samples):
num_words = 5
for _ in range(num_samples):
words = faker.words(num_words, ext_word_list=self.custom_word_list)
words = faker.words(num_words, word_list=self.custom_word_list)
assert isinstance(words, list)
assert len(words) == 5
assert all(isinstance(word, str) and word in self.custom_word_list for word in words)

def test_words_with_unique_sampling(self, faker, num_samples):
num_words = 5
for _ in range(num_samples):
words = faker.words(num_words, ext_word_list=self.custom_word_list, unique=True)
words = faker.words(num_words, word_list=self.custom_word_list, unique=True)
assert isinstance(words, list)
assert len(words) == 5

Expand Down Expand Up @@ -165,29 +165,26 @@ def test_texts(self, faker, num_samples):
words = re.sub(r"[.\n]+", " ", text.lower()).split()
assert all(word in self.custom_word_list for word in words)

@pytest.mark.parametrize(
"nb,part_of_speech", [(10, "verb"), (18, "adverb"), (11, "noun")], ids=["verb", "adverb", "noun"]
)
def test_words_part_of_speech(self, faker, nb, part_of_speech):
words = faker.words(nb=nb, part_of_speech=part_of_speech)
assert (word in EnUsLoremProvider.parts_of_speech[part_of_speech] for word in words)
def test_get_default_words_list(self, faker):
words_list = faker.get_words_list()
assert all(word in EnUsLoremProvider.word_list for word in words_list)

@pytest.mark.parametrize("part_of_speech", [("verb"), ("adverb"), ("noun")], ids=["verb", "adverb", "noun"])
def test_get_words_list_part_of_speech(self, faker, part_of_speech):
words_list = faker.get_words_list(part_of_speech=part_of_speech)
assert (word in EnUsLoremProvider.parts_of_speech[part_of_speech] for word in words_list)

def test_get_words_list_invalid_part_of_speech(self, faker):
part_of_speech = "invalid part of speech"

@pytest.mark.parametrize("nb,part_of_speech", [(5, "abcdefg")], ids=["invalid part of speech"])
def test_words_invalid_part_of_speech(self, faker, nb, part_of_speech):
with pytest.raises(ValueError) as exc_info:
faker.words(nb=nb, part_of_speech=part_of_speech)
faker.get_words_list(part_of_speech=part_of_speech)

assert exc_info.type is ValueError
assert exc_info.value.args[0] == f"{part_of_speech} is not recognized as a part of speech."

@pytest.mark.parametrize(
"nb,part_of_speech",
[(3, "adverb"), (5, "verb"), (4, "abcdefgh")],
ids=["ignore adverb", "ignore verb", "ignore invalid part of speech"],
)
def test_words_part_of_speech_ignored(self, faker, nb, part_of_speech):
words = faker.words(nb=nb, part_of_speech=part_of_speech, ext_word_list=self.custom_word_list)
assert len(words) == nb
def test_get_words_list_part_of_speech_ignored(self, faker):
words = faker.get_words_list(part_of_speech="ignored part of speech", ext_word_list=self.custom_word_list)
assert all(word in self.custom_word_list for word in words)


Expand Down
32 changes: 32 additions & 0 deletions tests/providers/test_python.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import decimal
import logging
import sys
import unittest
import warnings
Expand Down Expand Up @@ -299,6 +300,37 @@ def test_float_min_and_max_value_with_same_whole(self):
self.fake.pyfloat(min_value=2.3, max_value=2.5)


class TestPyDict(unittest.TestCase):
    """Tests for ``pydict``, including the upper clamp on ``nb_elements``."""

    def setUp(self):
        # Fixed seed makes the variable-nb_elements paths deterministic.
        self.fake = Faker()
        Faker.seed(0)

    def test_pydict_with_default_nb_elements(self):
        # With seed 0 the default nb_elements=10 produces exactly 10 entries.
        result = self.fake.pydict()

        self.assertEqual(len(result), 10)

    def test_pydict_with_valid_number_of_nb_elements(self):
        result = self.fake.pydict(nb_elements=5)

        self.assertEqual(len(result), 5)

    def test_pydict_with_invalid_number_of_nb_elements(self):
        # nb_elements above the word-list size must be clamped to the list
        # size, with a warning logged once on the provider's logger.
        nb_elements = 10000

        words_list_count = len(self.fake.get_words_list())

        logger = logging.getLogger("faker.providers.python")

        with patch.object(logger, "warning") as mock_warn:
            result = self.fake.pydict(nb_elements=nb_elements)

        # The expected message is split across adjacent literals to stay
        # within the 120-char flake8 limit; the concatenated value is
        # byte-identical to the provider's warning message.
        mock_warn.assert_called_once_with(
            "Number of nb_elements is greater than the number of words in the list. "
            f"{words_list_count} words will be used."
        )
        self.assertEqual(len(result), words_list_count)

class TestPydecimal(unittest.TestCase):
def setUp(self):
self.fake = Faker()
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/session_overrides/session_locale/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
_MODULE_LOCALES = ["en_GB"]
_MODULE_LOCALES = ["en_US"]

0 comments on commit b01507a

Please sign in to comment.