Adding sampling tests and optimize imports
alexmindset committed May 13, 2024
1 parent 3f0df9e commit 1479535
Showing 2 changed files with 83 additions and 14 deletions.
29 changes: 15 additions & 14 deletions insolver/feature_engineering/sampling.py
@@ -1,5 +1,6 @@
-import pandas as pd
-import numpy as np
+from numpy import arange, repeat, delete, append
+from numpy.random import choice
+from pandas import DataFrame, concat


class Sampling:
@@ -33,7 +34,7 @@ def sample_dataset(self, df):
            df (pandas.Dataframe): The dataframe.

        Raises:
-            NotImplementedError: If self.method is not supported.
+            NotImplementedError: If `self.method` is not supported.

        Returns:
            New dataset with selected rows.
@@ -80,7 +81,7 @@ def _systematic_sampling(self, df):
            New dataset with selected rows.
        """
        # get indexes with selected step
-        indexes = np.arange(0, len(df), step=self.n)
+        indexes = arange(0, len(df), step=self.n)
        # get only selected indexes
        systematic_sample = df.iloc[indexes]
        return systematic_sample
@@ -100,22 +101,22 @@ def _cluster_sampling(self, df):
        # count clusters to check
        clusters_count = cluster_df['cluster_id'].unique().sum()

-        cluster_sample = pd.DataFrame()
+        cluster_sample = DataFrame()

        # if the selected number of clusters is bigger then the created number raise error
        if self.n > clusters_count:
-            raise Exception(f'{self.n} cannot be bigger then number of clusters.')
+            raise ValueError(f'{self.n} cannot be bigger then number of clusters.')

        # if the selected number of clusters equals the created number return df
        elif self.n == clusters_count:
            return df

        else:
            # randomly chose clusters to keep
-            clusters_to_keep = np.random.choice(cluster_df['cluster_id'].unique(), self.n)
+            clusters_to_keep = choice(cluster_df['cluster_id'].unique(), self.n)
            for cluster in clusters_to_keep:
                # create a new DataFrame only with the selected clusters
-                cluster_sample = pd.concat([cluster_sample, cluster_df[cluster_df['cluster_id'] == cluster]])
+                cluster_sample = concat([cluster_sample, cluster_df[cluster_df['cluster_id'] == cluster]])

        return cluster_sample

@@ -132,13 +133,13 @@ def _stratified_sampling(self, df):
        # create clusters
        cluster_df = self._create_clusters(df)

-        stratified_sample = pd.DataFrame()
+        stratified_sample = DataFrame()

        for cluster in cluster_df['cluster_id'].unique():
            # get selected number of values from each cluster
            sample_cluster = cluster_df[cluster_df['cluster_id'] == cluster].sample(n=self.n)
            # create a new DataFrame only with the selected values in the cluster
-            stratified_sample = pd.concat([stratified_sample, sample_cluster])
+            stratified_sample = concat([stratified_sample, sample_cluster])

        return stratified_sample

@@ -174,22 +175,22 @@ def _create_clusters(self, df):
        else:
            try:
                # try if the clusters can be filled exactly
-                new_df['cluster_id'] = np.repeat([range(1, self.n_clusters + 1)], cluster_size)
+                new_df['cluster_id'] = repeat(list(range(1, self.n_clusters + 1)), cluster_size)

            except ValueError:
                # if not get indexes
-                indexes = np.repeat([range(1, self.n_clusters + 1)], cluster_size)
+                indexes = repeat(list(range(1, self.n_clusters + 1)), cluster_size)
                # calculate the difference
                diff = len(indexes) - len(df)

                # if the difference is greater than 0 delete one row
                if diff > 0:
                    for i in range(diff):
-                        new_df['cluster_id'] = np.delete(indexes, len(indexes) - 1)
+                        new_df['cluster_id'] = delete(indexes, len(indexes) - 1)

                # if the difference is less than 0 add one row
                if diff < 0:
                    for i in range(abs(diff)):
-                        new_df['cluster_id'] = np.append(indexes, self.n_clusters)
+                        new_df['cluster_id'] = append(indexes, self.n_clusters)

        return new_df
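
The _create_clusters hunk above pads or trims the generated cluster ids when len(df) is not evenly divisible by n_clusters. Below is a minimal standalone sketch of that balancing logic using the newly imported numpy helpers; the assign_cluster_ids function name and the round()-based cluster size are illustrative assumptions, not insolver API.

# Minimal sketch, not part of the commit: balance cluster ids the way
# _create_clusters does. assign_cluster_ids and the round()-based
# cluster_size are assumptions for illustration only.
from numpy import arange, repeat, delete, append
from pandas import DataFrame


def assign_cluster_ids(df, n_clusters):
    cluster_size = round(len(df) / n_clusters)  # assumed rounding rule
    indexes = repeat(list(range(1, n_clusters + 1)), cluster_size)
    diff = len(indexes) - len(df)
    if diff > 0:
        # too many ids: trim the surplus from the tail
        for _ in range(diff):
            indexes = delete(indexes, len(indexes) - 1)
    elif diff < 0:
        # too few ids: pad with the last cluster id
        for _ in range(abs(diff)):
            indexes = append(indexes, n_clusters)
    out = df.copy()
    out['cluster_id'] = indexes
    return out


print(assign_cluster_ids(DataFrame({'A': arange(7)}), 2)['cluster_id'].tolist())
# [1, 1, 1, 1, 2, 2, 2]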
68 changes: 68 additions & 0 deletions tests/feature_engineering/test_sampling.py
@@ -0,0 +1,68 @@
import pytest
import pandas as pd
import numpy as np
from insolver.feature_engineering import Sampling


@pytest.fixture
def sample_data():
    return pd.DataFrame({'A': np.arange(100), 'B': np.random.rand(100), 'Cluster': np.repeat(np.arange(10), 10)})


def test_simple_sampling(sample_data):
    sampler = Sampling(n=5, method='simple')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 5


def test_systematic_sampling(sample_data):
    sampler = Sampling(n=10, method='systematic')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 10


def test_cluster_sampling(sample_data):
    sampler = Sampling(n=2, method='cluster')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 20  # Two clusters chosen, each with 10 elements


def test_stratified_sampling(sample_data):
    sampler = Sampling(n=2, method='stratified', cluster_column='Cluster')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 20  # Two elements chosen from each cluster


def test_method_not_supported(sample_data):
    sampler = Sampling(n=5, method='unsupported_method')
    with pytest.raises(NotImplementedError):
        sampler.sample_dataset(sample_data)


def test_cluster_sampling_with_invalid_n(sample_data):
    sampler = Sampling(n=150, method='cluster')
    with pytest.raises(ValueError):
        sampler.sample_dataset(sample_data)
    sampler = Sampling(n=55, method='cluster')
    sampler.sample_dataset(sample_data)


def test_create_clusters_with_null_values():
    with pytest.raises(ValueError):
        sampler = Sampling(n=5, method='cluster', cluster_column='Cluster')
        sampler._create_clusters(pd.DataFrame({'Cluster': [1, 2, np.nan, 4]}))


def test_create_clusters_with_insufficient_data():
    with pytest.raises(ValueError):
        sampler = Sampling(n=10, n_clusters=10, method='cluster')
        sampler._create_clusters(pd.DataFrame({'A': [1, 2, 3]}))


def test_create_clusters_with_extra_data():
    data = {'A': np.arange(7)}
    df = pd.DataFrame(data)
    sampler = Sampling(n=5, n_clusters=2, method='cluster')
    new_df = sampler._create_clusters(df)
    assert len(new_df) == 7
    assert all(1 <= cluster_id <= 2 for cluster_id in new_df['cluster_id'])
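
For orientation, a minimal usage sketch of the Sampling API exercised by the tests above; only constructor arguments that appear in the tests are used, nothing else is assumed.

# Minimal usage sketch based on the tests above; no arguments beyond those shown are assumed.
import pandas as pd
import numpy as np
from insolver.feature_engineering import Sampling

df = pd.DataFrame({'A': np.arange(100), 'B': np.random.rand(100)})
simple = Sampling(n=5, method='simple').sample_dataset(df)  # 5 randomly selected rows
systematic = Sampling(n=10, method='systematic').sample_dataset(df)  # every 10th row
print(len(simple), len(systematic))  # 5 10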
