Skip to content

Commit

Permalink
Fix upper limit of nb_elements (#2067)
Browse files Browse the repository at this point in the history
* upper limit nb_elements fix

* upper limit nb_elements fix tests
  • Loading branch information
mileswatsonbjss committed Jun 26, 2024
1 parent 97c046c commit b01507a
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 61 deletions.
72 changes: 50 additions & 22 deletions faker/providers/lorem/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,40 +23,33 @@ class Provider(BaseProvider):
word_connector = " "
sentence_punctuation = "."

def words(
def get_words_list(
self,
nb: int = 3,
part_of_speech: Optional[str] = None,
ext_word_list: Optional[Sequence[str]] = None,
unique: bool = False,
) -> List[str]:
"""Generate a tuple of words.
The ``nb`` argument controls the number of words in the resulting list,
and if ``ext_word_list`` is provided, words from that list will be used
instead of those from the locale provider's built-in word list.
"""Get list of words.
If ``unique`` is ``True``, this method will return a list containing
unique words. Under the hood, |random_sample| will be used for sampling
without replacement. If ``unique`` is ``False``, |random_choices| is
used instead, and the list returned may contain duplicates.
``ext_word_list`` is a parameter that allows the user to provide a list
of words to be used instead of the built-in word list. If ``ext_word_list``
is provided, then the value of ``part_of_speech`` is ignored.
``part_of_speech`` is a parameter that defines to what part of speech
the returned word belongs. If ``ext_word_list`` is not ``None``, then
``part_of_speech`` is ignored. If the value of ``part_of_speech`` does
not correspond to an existent part of speech according to the set locale,
then an exception is raised.
.. warning::
Depending on the length of a locale provider's built-in word list or
on the length of ``ext_word_list`` if provided, a large ``nb`` can
exhaust said lists if ``unique`` is ``True``, raising an exception.
:sample: part_of_speech="abc", ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample: part_of_speech="abc"
:sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample:
:sample: nb=5
:sample: nb=5, ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample: nb=4, ext_word_list=['abc', 'def', 'ghi', 'jkl'], unique=True
.. warning::
Depending on the length of a locale provider's built-in word list or
on the length of ``ext_word_list`` if provided, a large ``nb`` can
exhaust said lists if ``unique`` is ``True``, raising an exception.
"""

if ext_word_list is not None:
word_list = ext_word_list
elif part_of_speech:
Expand All @@ -67,6 +60,38 @@ def words(
else:
word_list = self.word_list # type: ignore[attr-defined]

return word_list

Check failure on line 63 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.8)

Incompatible return value type

Check failure on line 63 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.9)

Incompatible return value type

Check failure on line 63 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.10)

Incompatible return value type

def words(
self,
nb: int = 3,
word_list: List[str] = None,

Check failure on line 68 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.8)

Incompatible default for

Check failure on line 68 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.9)

Incompatible default for

Check failure on line 68 in faker/providers/lorem/__init__.py

View workflow job for this annotation

GitHub Actions / typing (3.10)

Incompatible default for
unique: bool = False,
) -> List[str]:
"""Generate a tuple of words.
The ``nb`` argument controls the number of words in the resulting list,
and if ``ext_word_list`` is provided, words from that list will be used
instead of those from the locale provider's built-in word list.
If ``word_list`` is not provided, it defaults to ``None``,
which will result in the method calling the ``get_words_list`` method to get the
word list. If ``word_list`` is provided, the method will use the provided list.
If ``unique`` is ``True``, this method will return a list containing
unique words. Under the hood, |random_sample| will be used for sampling
without replacement. If ``unique`` is ``False``, |random_choices| is
used instead, and the list returned may contain duplicates.
:sample:
:sample: nb=5
:sample: nb=5, ext_word_list=['abc', 'def', 'ghi', 'jkl']
:sample: nb=4, ext_word_list=['abc', 'def', 'ghi', 'jkl'], unique=True
"""

if word_list is None:
word_list = self.get_words_list()

if unique:
unique_samples = cast(List[str], self.random_sample(word_list, length=nb))
return unique_samples
Expand All @@ -82,7 +107,9 @@ def word(self, part_of_speech: Optional[str] = None, ext_word_list: Optional[Seq
:sample:
:sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
"""
return self.words(1, part_of_speech, ext_word_list)[0]
word_list = self.get_words_list(part_of_speech, ext_word_list)

return self.words(1, word_list)[0]

def sentence(
self, nb_words: int = 6, variable_nb_words: bool = True, ext_word_list: Optional[Sequence[str]] = None
Expand All @@ -109,7 +136,8 @@ def sentence(
if variable_nb_words:
nb_words = self.randomize_nb_elements(nb_words, min=1)

words = list(self.words(nb=nb_words, ext_word_list=ext_word_list))
word_list = self.get_words_list(ext_word_list=ext_word_list)
words = list(self.words(nb=nb_words, word_list=word_list))
words[0] = words[0].title()

return self.word_connector.join(words) + self.sentence_punctuation
Expand Down
5 changes: 4 additions & 1 deletion faker/providers/lorem/en_PH/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ def english_words(self, nb: int = 3, unique: bool = False) -> List[str]:
:sample: nb=5
:sample: nb=5, unique=True
"""
return self.words(nb=nb, ext_word_list=self.english_word_list, unique=unique)

word_list = self.generator.get_words_list(ext_word_list=self.english_word_list)

return self.words(nb=nb, word_list=word_list, unique=unique)

def english_sentence(self, nb_words: int = 6, variable_nb_words: bool = True) -> str:
"""Generate a sentence in English.
Expand Down
12 changes: 12 additions & 0 deletions faker/providers/python/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import math
import string
import sys
Expand All @@ -16,6 +17,8 @@
TypesSpec = Union[List[Type], Tuple[Type, ...]]
TEnum = TypeVar("TEnum", bound=Enum)

logger = logging.getLogger(__name__)


class EmptyEnumException(BaseFakerException):
pass
Expand Down Expand Up @@ -466,9 +469,18 @@ def pydict(
:variable_nb_elements: whether to use a variable number of elements for the dictionary
:value_types: type of dictionary values
"""

words_list_count = len(self.generator.get_words_list())

if variable_nb_elements:
nb_elements = self.randomize_nb_elements(nb_elements, min=1)

if nb_elements > words_list_count:
logger.warning(
f"Number of nb_elements is greater than the number of words in the list. {words_list_count} words will be used."

Check failure on line 480 in faker/providers/python/__init__.py

View workflow job for this annotation

GitHub Actions / flake8

line too long (128 > 120 characters)
)
nb_elements = words_list_count

return dict(
zip(
self.generator.words(nb_elements, unique=True),
Expand Down
49 changes: 31 additions & 18 deletions faker/proxy.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1588,6 +1588,32 @@ class Faker:
def isbn10(self, separator: str = ...) -> str: ...
def isbn13(self, separator: str = ...) -> str: ...
def job(self) -> str: ...
def get_words_list(
    self, part_of_speech: Optional[str] = ..., ext_word_list: Optional[Sequence[str]] = ...
) -> List[str]:
    """
    Get list of words.
    ``ext_word_list`` is a parameter that allows the user to provide a list
    of words to be used instead of the built-in word list. If ``ext_word_list``
    is provided, then the value of ``part_of_speech`` is ignored.
    ``part_of_speech`` is a parameter that defines to what part of speech
    the returned word belongs. If ``ext_word_list`` is not ``None``, then
    ``part_of_speech`` is ignored. If the value of ``part_of_speech`` does
    not correspond to an existent part of speech according to the set locale,
    then an exception is raised.
    :sample: part_of_speech="abc", ext_word_list=['abc', 'def', 'ghi', 'jkl']
    :sample: part_of_speech="abc"
    :sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
    .. warning::
       The length of the returned list is bounded by the length of the locale
       provider's built-in word list, or of ``ext_word_list`` if provided, so
       callers sampling without replacement from it can exhaust it.
    """
    ...
def paragraph(
self, nb_sentences: int = ..., variable_nb_sentences: bool = ..., ext_word_list: Optional[Sequence[str]] = ...
) -> str:
Expand Down Expand Up @@ -1703,36 +1729,23 @@ class Faker:
:sample: ext_word_list=['abc', 'def', 'ghi', 'jkl']
"""
...
def words(
self,
nb: int = ...,
part_of_speech: Optional[str] = ...,
ext_word_list: Optional[Sequence[str]] = ...,
unique: bool = ...,
) -> List[str]:
def words(self, nb: int = ..., word_list: List[str] = ..., unique: bool = ...) -> List[str]:
"""
Generate a list of words.
The ``nb`` argument controls the number of words in the resulting list,
and if ``ext_word_list`` is provided, words from that list will be used
instead of those from the locale provider's built-in word list.
if ``word_list`` is not provided, the method will use a default value of None,
which will result in the method calling the ``get_words_list`` method to get the
word list. If ``word_list`` is provided, the method will use the provided list.
If ``unique`` is ``True``, this method will return a list containing
unique words. Under the hood, |random_sample| will be used for sampling
without replacement. If ``unique`` is ``False``, |random_choices| is
used instead, and the list returned may contain duplicates.
``part_of_speech`` is a parameter that defines to what part of speech
the returned word belongs. If ``ext_word_list`` is not ``None``, then
``part_of_speech`` is ignored. If the value of ``part_of_speech`` does
not correspond to an existent part of speech according to the set locale,
then an exception is raised.
.. warning::
Depending on the length of a locale provider's built-in word list or
on the length of ``ext_word_list`` if provided, a large ``nb`` can
exhaust said lists if ``unique`` is ``True``, raising an exception.
:sample:
:sample: nb=5
:sample: nb=5, ext_word_list=['abc', 'def', 'ghi', 'jkl']
Expand Down
35 changes: 16 additions & 19 deletions tests/providers/test_lorem.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ def test_words_with_defaults(self, faker, num_samples):
def test_words_with_custom_word_list(self, faker, num_samples):
num_words = 5
for _ in range(num_samples):
words = faker.words(num_words, ext_word_list=self.custom_word_list)
words = faker.words(num_words, word_list=self.custom_word_list)
assert isinstance(words, list)
assert len(words) == 5
assert all(isinstance(word, str) and word in self.custom_word_list for word in words)

def test_words_with_unique_sampling(self, faker, num_samples):
num_words = 5
for _ in range(num_samples):
words = faker.words(num_words, ext_word_list=self.custom_word_list, unique=True)
words = faker.words(num_words, word_list=self.custom_word_list, unique=True)
assert isinstance(words, list)
assert len(words) == 5

Expand Down Expand Up @@ -165,29 +165,26 @@ def test_texts(self, faker, num_samples):
words = re.sub(r"[.\n]+", " ", text.lower()).split()
assert all(word in self.custom_word_list for word in words)

@pytest.mark.parametrize(
"nb,part_of_speech", [(10, "verb"), (18, "adverb"), (11, "noun")], ids=["verb", "adverb", "noun"]
)
def test_words_part_of_speech(self, faker, nb, part_of_speech):
words = faker.words(nb=nb, part_of_speech=part_of_speech)
assert (word in EnUsLoremProvider.parts_of_speech[part_of_speech] for word in words)
def test_get_default_words_list(self, faker):
words_list = faker.get_words_list()
assert all(word in EnUsLoremProvider.word_list for word in words_list)

@pytest.mark.parametrize("part_of_speech", [("verb"), ("adverb"), ("noun")], ids=["verb", "adverb", "noun"])
def test_get_words_list_part_of_speech(self, faker, part_of_speech):
words_list = faker.get_words_list(part_of_speech=part_of_speech)
assert (word in EnUsLoremProvider.parts_of_speech[part_of_speech] for word in words_list)

def test_get_words_list_invalid_part_of_speech(self, faker):
part_of_speech = "invalid part of speech"

@pytest.mark.parametrize("nb,part_of_speech", [(5, "abcdefg")], ids=["invalid part of speech"])
def test_words_invalid_part_of_speech(self, faker, nb, part_of_speech):
with pytest.raises(ValueError) as exc_info:
faker.words(nb=nb, part_of_speech=part_of_speech)
faker.get_words_list(part_of_speech=part_of_speech)

assert exc_info.type is ValueError
assert exc_info.value.args[0] == f"{part_of_speech} is not recognized as a part of speech."

@pytest.mark.parametrize(
"nb,part_of_speech",
[(3, "adverb"), (5, "verb"), (4, "abcdefgh")],
ids=["ignore adverb", "ignore verb", "ignore invalid part of speech"],
)
def test_words_part_of_speech_ignored(self, faker, nb, part_of_speech):
words = faker.words(nb=nb, part_of_speech=part_of_speech, ext_word_list=self.custom_word_list)
assert len(words) == nb
def test_get_words_list_part_of_speech_ignored(self, faker):
words = faker.get_words_list(part_of_speech="ignored part of speech", ext_word_list=self.custom_word_list)
assert all(word in self.custom_word_list for word in words)


Expand Down
32 changes: 32 additions & 0 deletions tests/providers/test_python.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import decimal
import logging
import sys
import unittest
import warnings
Expand Down Expand Up @@ -299,6 +300,37 @@ def test_float_min_and_max_value_with_same_whole(self):
self.fake.pyfloat(min_value=2.3, max_value=2.5)


class TestPyDict(unittest.TestCase):
    """Tests for ``pydict``, including the upper clamp on ``nb_elements``."""

    def setUp(self):
        # Fixed seed makes the variable-nb_elements paths deterministic.
        self.fake = Faker()
        Faker.seed(0)

    def test_pydict_with_default_nb_elements(self):
        # With seed 0 the default nb_elements=10 produces exactly 10 entries.
        result = self.fake.pydict()

        self.assertEqual(len(result), 10)

    def test_pydict_with_valid_number_of_nb_elements(self):
        result = self.fake.pydict(nb_elements=5)

        self.assertEqual(len(result), 5)

    def test_pydict_with_invalid_number_of_nb_elements(self):
        # nb_elements above the word-list size must be clamped to the list
        # size, with a warning logged once on the provider's logger.
        nb_elements = 10000

        words_list_count = len(self.fake.get_words_list())

        logger = logging.getLogger("faker.providers.python")

        with patch.object(logger, "warning") as mock_warn:
            result = self.fake.pydict(nb_elements=nb_elements)

        # The expected message is split across adjacent literals to stay
        # within the 120-char flake8 limit; the concatenated value is
        # byte-identical to the provider's warning message.
        mock_warn.assert_called_once_with(
            "Number of nb_elements is greater than the number of words in the list. "
            f"{words_list_count} words will be used."
        )
        self.assertEqual(len(result), words_list_count)

class TestPydecimal(unittest.TestCase):
def setUp(self):
self.fake = Faker()
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/session_overrides/session_locale/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
_MODULE_LOCALES = ["en_GB"]
_MODULE_LOCALES = ["en_US"]

0 comments on commit b01507a

Please sign in to comment.