
Commit

Merge pull request #74 from Netflix-Skunkworks/ramsrivatsak/cpu-utilization-cass

Add cpu utilization as a parameter for the capacity planner to consume
ramsrivatsa authored Oct 26, 2023
2 parents a7caf1d + de475e0 commit f819b9d
Showing 5 changed files with 122 additions and 6 deletions.
service_capacity_modeling/capacity_planner.py (19 additions, 0 deletions)
@@ -24,6 +24,7 @@
 from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Drive
+from service_capacity_modeling.interface import Hardware
 from service_capacity_modeling.interface import Instance
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import interval
@@ -182,6 +183,21 @@ def model_desires_percentiles(
     return results, d


+def _set_instance_objects(
+    desires: CapacityDesires,
+    hardware: Hardware,
+):
+    if desires.current_clusters:
+        for zonal_cluster_capacity in desires.current_clusters.zonal:
+            zonal_cluster_capacity.cluster_instance = hardware.instances[
+                zonal_cluster_capacity.cluster_instance_name
+            ]
+        for regional_cluster_capacity in desires.current_clusters.regional:
+            regional_cluster_capacity.cluster_instance = hardware.instances[
+                regional_cluster_capacity.cluster_instance_name
+            ]
+
+
 def _allow_instance(
     instance: Instance,
     allowed_names: Sequence[str],
@@ -575,6 +591,9 @@ def generate_scenarios(
     if len(allowed_drives) == 0:
         allowed_drives.update(hardware.drives.keys())

+    # Set current instance object if exists
+    _set_instance_objects(desires, hardware)
+
     if model.run_hardware_simulation():
         for instance in hardware.instances.values():
             if not _allow_instance(
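For intuition, the hydration step added above is just a name-to-object lookup performed once per plan, before hardware simulation. Below is a self-contained toy sketch of the same idea, using stand-in dataclasses rather than the library's real Hardware/Instance types; the names and the 32-vCPU figure are assumptions for illustration only:

```python
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class ToyInstance:
    name: str
    cpu: int


@dataclass
class ToyClusterCapacity:
    cluster_instance_name: str
    cluster_instance: Optional[ToyInstance] = None


# Stand-in for hardware.instances: a name -> shape lookup table
instances: Dict[str, ToyInstance] = {
    "i4i.8xlarge": ToyInstance(name="i4i.8xlarge", cpu=32),
}

cluster = ToyClusterCapacity(cluster_instance_name="i4i.8xlarge")
# The same lookup _set_instance_objects performs for each current cluster
cluster.cluster_instance = instances[cluster.cluster_instance_name]
assert cluster.cluster_instance.cpu == 32
```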
service_capacity_modeling/interface.py (26 additions, 1 deletion)
@@ -18,7 +18,6 @@
 from pydantic import BaseModel
 from pydantic import Field

-
 GIB_IN_BYTES = 1024 * 1024 * 1024
 MIB_IN_BYTES = 1024 * 1024
 MEGABIT_IN_BYTES = (1000 * 1000) / 8
@@ -621,6 +620,29 @@ class DataShape(ExcludeUnsetModel):
     )


+class CurrentClusterCapacity(ExcludeUnsetModel):
+    cluster_instance_name: str
+    cluster_instance: Optional[Instance] = None
+    cluster_instance_count: Interval
+    cpu_utilization: Interval
+
+
+# For services that are provisioned by zone (e.g. Cassandra, EVCache)
+class CurrentZoneClusterCapacity(CurrentClusterCapacity):
+    pass
+
+
+# For services that are provisioned regionally (e.g. Java services, RDS, etc ..)
+class CurrentRegionClusterCapacity(CurrentClusterCapacity):
+    pass
+
+
+class CurrentClusters(ExcludeUnsetModel):
+    zonal: Sequence[CurrentZoneClusterCapacity] = []
+    regional: Sequence[CurrentRegionClusterCapacity] = []
+    services: Sequence[ServiceCapacity] = []
+
+
 class CapacityDesires(ExcludeUnsetModel):
     # How critical is this cluster, impacts how much "extra" we provision
     # 0 = Critical to the product (Product does not function)
@@ -635,6 +657,9 @@
     # What will the state look like
     data_shape: DataShape = DataShape()

+    # What is the current microarchitectural/system configuration of the system
+    current_clusters: Optional[CurrentClusters] = None
+
     # When users are providing latency estimates, what is the typical
     # instance core frequency we are comparing to. Databases use i3s a lot
     # hence this default
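Taken together, the new interface models let callers describe the cluster they are running today. A minimal sketch of how they might be populated (values are illustrative; cluster_instance is left unset because the planner's new _set_instance_objects resolves it from the hardware shapes at planning time):

```python
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import CurrentClusters
from service_capacity_modeling.interface import CurrentZoneClusterCapacity
from service_capacity_modeling.interface import Interval

# An 8-node zonal cluster on i4i.8xlarge running at roughly 10-14% CPU
current = CurrentZoneClusterCapacity(
    cluster_instance_name="i4i.8xlarge",
    cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
    cpu_utilization=Interval(low=10.0, mid=13.0, high=14.0, confidence=1),
)

desires = CapacityDesires(
    service_tier=1,
    current_clusters=CurrentClusters(zonal=[current]),
)
```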
service_capacity_modeling/models/org/netflix/cassandra.py (24 additions, 3 deletions)
@@ -68,6 +68,7 @@ def _estimate_cassandra_requirement(
     working_set: float,
     reads_per_second: float,
     max_rps_to_disk: int,
+    required_cluster_size: Optional[int] = None,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
 ) -> CapacityRequirement:
@@ -76,8 +77,28 @@
     The input desires should be the **regional** desire, and this function will
     return the zonal capacity requirement
     """
-    # Keep half of the cores free for background work (compaction, backup, repair)
-    needed_cores = sqrt_staffed_cores(desires) * 2
+    current_capacity = (
+        None
+        if desires.current_clusters is None
+        else desires.current_clusters.zonal[0]
+        if len(desires.current_clusters.zonal)
+        else desires.current_clusters.regional[0]
+    )
+    # Keep half of the cores free for background work (compaction, backup, repair).
+    # Currently, zones and regions are configured in a homogeneous manner. Hence,
+    # we just take any one of the current cluster configurations.
+    if (
+        current_capacity
+        and current_capacity.cluster_instance
+        and required_cluster_size is not None
+    ):
+        needed_cores = (
+            current_capacity.cluster_instance.cpu
+            * required_cluster_size
+            * zones_per_region
+        ) * (current_capacity.cpu_utilization.high / 20)
+    else:
+        needed_cores = sqrt_staffed_cores(desires) * 2
     # Keep half of the bandwidth available for backup
     needed_network_mbps = simple_network_mbps(desires) * 2

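The utilization branch above sizes CPU from what the cluster actually uses: the total currently provisioned cores in the region, scaled by high-percentile CPU utilization over what reads as a 20% utilization target. A worked sketch with the numbers from the test below; the 32-vCPU count for i4i.8xlarge is an assumption about the hardware shapes, not something stated in this diff:

```python
cpu_per_node = 32             # assumed vCPUs of the current instance type
required_cluster_size = 8     # nodes per zone, pinned by the caller
zones_per_region = 3
cpu_utilization_high = 14.19  # observed high CPU utilization, in percent

# Regional cores needed: provisioned cores rescaled from the observed
# peak utilization to the 20 (percent) denominator in the new branch
needed_cores = (cpu_per_node * required_cluster_size * zones_per_region) * (
    cpu_utilization_high / 20
)
print(round(needed_cores))  # ~545 cores for the region
```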
@@ -183,7 +204,6 @@ def _estimate_cassandra_cluster_zonal(
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
-
     # Netflix Cassandra doesn't like to deploy on really small instances
     if instance.cpu < 2 or instance.ram_gib < 14:
         return None
@@ -238,6 +258,7 @@ def _estimate_cassandra_cluster_zonal(
         working_set=working_set,
         reads_per_second=rps,
         max_rps_to_disk=max_rps_to_disk,
+        required_cluster_size=required_cluster_size,
         zones_per_region=zones_per_region,
         copies_per_region=copies_per_region,
     )
tests/netflix/test_cassandra.py (53 additions, 1 deletion)
@@ -1,15 +1,18 @@
 from service_capacity_modeling.capacity_planner import planner
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import certain_int
 from service_capacity_modeling.interface import Consistency
+from service_capacity_modeling.interface import CurrentClusterCapacity
+from service_capacity_modeling.interface import CurrentClusters
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import FixedInterval
 from service_capacity_modeling.interface import GlobalConsistency
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern


 small_but_high_qps = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
@@ -301,3 +304,52 @@ def test_reduced_durability():
         cheap_plan.candidate_clusters.zonal[0].cluster_params["cassandra.keyspace.rf"]
         == 2
     )
+
+
+def test_plan_certain():
+    """
+    Use cpu utilization to determine instance types directly as opposed to
+    extrapolating it from the Data Shape
+    """
+    cluster_capacity = CurrentClusterCapacity(
+        cluster_instance_name="i4i.8xlarge",
+        cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
+        cpu_utilization=Interval(
+            low=10.12, mid=13.2, high=14.194801291058118, confidence=1
+        ),
+    )
+
+    worn_desire = CapacityDesires(
+        service_tier=1,
+        current_clusters=CurrentClusters(zonal=[cluster_capacity]),
+        query_pattern=QueryPattern(
+            access_pattern=AccessPattern(AccessPattern.latency),
+            estimated_read_per_second=Interval(
+                low=234248, mid=351854, high=485906, confidence=0.98
+            ),
+            estimated_write_per_second=Interval(
+                low=19841, mid=31198, high=37307, confidence=0.98
+            ),
+        ),
+        # We think we're going to have around 2 TiB of data
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(
+                low=2006.083, mid=2252.5, high=2480.41, confidence=0.98
+            ),
+            estimated_compression_ratio=Interval(low=1, mid=1, high=1, confidence=1),
+        ),
+    )
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        num_results=3,
+        num_regions=4,
+        desires=worn_desire,
+        extra_model_arguments={
+            "required_cluster_size": 8,
+        },
+    )
+
+    lr_clusters = cap_plan[0].candidate_clusters.zonal[0]
+    assert lr_clusters.count == 8
+    assert lr_clusters.instance.cpu == 16
tests/netflix/test_cassandra_uncertain.py (0 additions, 1 deletion)
@@ -4,7 +4,6 @@
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern

-
 uncertain_mid = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
