
Commit

Merge pull request verilog-to-routing#2799 from verilog-to-routing/temp_chan_w_factors_prefix_sum

Chan x/y placement cost factors using prefix sum
vaughnbetz authored Nov 11, 2024
2 parents 91f2941 + db3f7ae commit fdf6d3c
Showing 18 changed files with 125 additions and 140 deletions.
10 changes: 6 additions & 4 deletions libs/libvtrutil/src/vtr_ndoffsetmatrix.h
@@ -2,6 +2,7 @@
#define VTR_ND_OFFSET_MATRIX_H
#include <array>
#include <memory>
#include <algorithm>

#include "vtr_assert.h"

@@ -309,9 +310,8 @@ class NdOffsetMatrixBase {

///@brief Swap two NdOffsetMatrixBase objects
friend void swap(NdOffsetMatrixBase<T, N>& m1, NdOffsetMatrixBase<T, N>& m2) {
using std::swap;
swap(m1.dim_ranges_, m2.dim_ranges_);
swap(m1.data_, m2.data_);
std::swap(m1.dim_ranges_, m2.dim_ranges_);
std::swap(m1.data_, m2.data_);
}

private:
@@ -441,7 +441,9 @@ class NdOffsetMatrix<T, 1> : public NdOffsetMatrixBase<T, 1> {
VTR_ASSERT_SAFE_MSG(index >= this->dim_ranges_[0].begin_index(), "Index out of range (below dimension minimum)");
VTR_ASSERT_SAFE_MSG(index < this->dim_ranges_[0].end_index(), "Index out of range (above dimension maximum)");

return this->data_[index];
int effective_index = index - this->dim_ranges_[0].begin_index();

return this->data_[effective_index];
}

///@brief Access an element (mutable)
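The change above makes the 1-D specialization translate a logical index by the dimension's begin offset before touching the underlying storage. A minimal usage sketch of the fixed behaviour (the matrix shape and values here are assumptions, and the mutable accessor, elided in this hunk, is assumed to apply the same translation):

// A 1-D offset matrix whose valid logical indices are -1 .. 4, declared with the same
// constructor form as the accumulated channel-width arrays added later in this commit.
vtr::NdOffsetMatrix<int, 1> acc({{{-1, 5}}});
acc[-1] = 0;  // effective_index = -1 - (-1) = 0  -> data_[0]
acc[4] = 42;  // effective_index =  4 - (-1) = 5  -> data_[5], the last element
// Before this fix, the const accessor indexed data_ with the raw logical index, so any
// dimension with a non-zero begin offset returned the wrong element.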
2 changes: 0 additions & 2 deletions vpr/src/base/SetupVPR.cpp
@@ -633,8 +633,6 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts)
PlacerOpts->inner_loop_recompute_divider = Options.inner_loop_recompute_divider;
PlacerOpts->quench_recompute_divider = Options.quench_recompute_divider;

PlacerOpts->place_cost_exp = 1;

PlacerOpts->td_place_exp_first = Options.place_exp_first;

PlacerOpts->td_place_exp_last = Options.place_exp_last;
2 changes: 0 additions & 2 deletions vpr/src/base/ShowSetup.cpp
@@ -547,8 +547,6 @@ static void ShowPlacerOpts(const t_placer_opts& PlacerOpts,
VTR_LOG("Using constraints file '%s'\n", PlacerOpts.constraints_file.c_str());
}

VTR_LOG("PlacerOpts.place_cost_exp: %f\n", PlacerOpts.place_cost_exp);

VTR_LOG("PlacerOpts.place_chan_width: %d\n", PlacerOpts.place_chan_width);

if (PlacerOpts.place_algorithm.is_timing_driven()) {
1 change: 0 additions & 1 deletion vpr/src/base/vpr_types.h
@@ -1065,7 +1065,6 @@ struct t_placer_opts {
t_place_algorithm place_algorithm;
t_place_algorithm place_quench_algorithm;
float timing_tradeoff;
float place_cost_exp;
int place_chan_width;
enum e_pad_loc_type pad_loc_type;
std::string constraints_file;
172 changes: 69 additions & 103 deletions vpr/src/place/net_cost_handler.cpp
Expand Up @@ -151,79 +151,48 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
}

void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
const double place_cost_exp = static_cast<double>(placer_opts_.place_cost_exp);
auto& device_ctx = g_vpr_ctx.device();

const int grid_height = device_ctx.grid.height();
const int grid_width = device_ctx.grid.width();

/* Access arrays below as chan?_place_cost_fac_(subhigh, sublow). Since subhigh must be greater than or
* equal to sublow, we will only access the lower half of a matrix, but we allocate the whole matrix anyway
* for simplicity, so we can use the vtr utility matrix functions. */
chanx_place_cost_fac_ = vtr::NdOffsetMatrix<float, 2>({{{-1, grid_height}, {-1, grid_height}}});
chany_place_cost_fac_ = vtr::NdOffsetMatrix<float, 2>({{{-1, grid_width}, {-1, grid_width}}});

// First compute the number of tracks between channel high and channel low, inclusive.
chanx_place_cost_fac_[-1][-1] = 0;

for (int high = 0; high < grid_height; high++) {
chanx_place_cost_fac_[high][high] = (float)device_ctx.chan_width.x_list[high];
for (int low = -1; low < high; low++) {
chanx_place_cost_fac_[high][low] = chanx_place_cost_fac_[high - 1][low] + (float)device_ctx.chan_width.x_list[high];
}
}

/* Now compute the inverse of the average number of tracks per channel *
* between high and low. The cost function divides by the average *
* number of tracks per channel, so by storing the inverse I convert *
* this to a faster multiplication. Take this final number to the *
* place_cost_exp power -- numbers other than one mean this is no *
* longer a simple "average number of tracks"; it is some power of *
* that, allowing greater penalization of narrow channels. */
for (int high = -1; high < grid_height; high++) {
for (int low = -1; low <= high; low++) {
/* Since we will divide the wiring cost by the average channel *
* capacity between high and low, having only 0 width channels *
* will result in infinite wiring capacity normalization *
* factor, and extremely bad placer behaviour. Hence we change *
* this to a small (1 track) channel capacity instead. */
if (chanx_place_cost_fac_[high][low] == 0.0f) {
VTR_LOG_WARN("CHANX place cost fac is 0 at %d %d\n", high, low);
chanx_place_cost_fac_[high][low] = 1.0f;
}
const auto& device_ctx = g_vpr_ctx.device();

chanx_place_cost_fac_[high][low] = (high - low + 1.) / chanx_place_cost_fac_[high][low];
chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], place_cost_exp);
}
}
const int grid_height = (int)device_ctx.grid.height();
const int grid_width = (int)device_ctx.grid.width();

/* These arrays contain the accumulated channel width between channel zero and
 * the channel specified by the given index. The accumulation is inclusive,
 * meaning that it includes both channel zero and channel `idx`.
 * To compute the total channel width between channels 'low' and 'high', use:
 *       acc_chan?_width_[high] - acc_chan?_width_[low - 1]
 * This gives the total number of tracks between channels 'low' and 'high',
 * including the tracks in both of those channels (see the lookup sketch after this hunk).
 *
 * Channel -1 doesn't exist, so we treat it as having zero tracks. We need to be able
 * to access these arrays with index -1 to handle cases where the lower channel is 0.
 */
acc_chanx_width_ = vtr::NdOffsetMatrix<int, 1>({{{-1, grid_height}}});
acc_chany_width_ = vtr::NdOffsetMatrix<int, 1>({{{-1, grid_width}}});

/* Now do the same thing for the y-directed channels. First get the
* number of tracks between channel high and channel low, inclusive. */
chany_place_cost_fac_[-1][-1] = 0;
// initialize the first element (index -1) with zero
acc_chanx_width_[-1] = 0;
for (int y = 0; y < grid_height; y++) {
acc_chanx_width_[y] = acc_chanx_width_[y - 1] + device_ctx.chan_width.x_list[y];

for (int high = 0; high < grid_width; high++) {
chany_place_cost_fac_[high][high] = device_ctx.chan_width.y_list[high];
for (int low = -1; low < high; low++) {
chany_place_cost_fac_[high][low] = chany_place_cost_fac_[high - 1][low] + device_ctx.chan_width.y_list[high];
/* If the number of tracks in a channel is zero, two consecutive elements take the same
* value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
* potential issue, we assume that the channel width is at least 1.
*/
if (acc_chanx_width_[y] == acc_chanx_width_[y - 1]) {
acc_chanx_width_[y]++;
}
}

/* Now compute the inverse of the average number of tracks per channel
* between high and low. Take to specified power. */
for (int high = -1; high < grid_width; high++) {
for (int low = -1; low <= high; low++) {
/* Since we will divide the wiring cost by the average channel *
* capacity between high and low, having only 0 width channels *
* will result in infinite wiring capacity normalization *
* factor, and extremely bad placer behaviour. Hence we change *
* this to a small (1 track) channel capacity instead. */
if (chany_place_cost_fac_[high][low] == 0.0f) {
VTR_LOG_WARN("CHANY place cost fac is 0 at %d %d\n", high, low);
chany_place_cost_fac_[high][low] = 1.0f;
}
// initialize the first element (index -1) with zero
acc_chany_width_[-1] = 0;
for (int x = 0; x < grid_width; x++) {
acc_chany_width_[x] = acc_chany_width_[x - 1] + device_ctx.chan_width.y_list[x];

chany_place_cost_fac_[high][low] = (high - low + 1.) / chany_place_cost_fac_[high][low];
chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], place_cost_exp);
// to avoid a division by zero
if (acc_chany_width_[x] == acc_chany_width_[x - 1]) {
acc_chany_width_[x]++;
}
}

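With the accumulated widths above, the total track count over any span of channels is two lookups and a subtraction, and the inverse-average factor that the old triangular chan?_place_cost_fac_ matrices precomputed can instead be formed on demand. A minimal sketch of such a lookup, modelled on the get_chanxy_cost_fac_() helper referenced in the comments (its real signature and body are not shown in this diff, so the details below are assumptions):

// Hypothetical sketch: returns the inverse of the average number of tracks per channel
// spanned by a net's bounding box, so the cost function can multiply instead of divide.
std::pair<double, double> get_chanxy_cost_fac_sketch(const t_bb& bb) {
    // x-directed tracks lie in horizontal channels bb.ymin .. bb.ymax (inclusive).
    const int chanx_tracks = acc_chanx_width_[bb.ymax] - acc_chanx_width_[bb.ymin - 1];
    // y-directed tracks lie in vertical channels bb.xmin .. bb.xmax (inclusive).
    const int chany_tracks = acc_chany_width_[bb.xmax] - acc_chany_width_[bb.xmin - 1];

    const double chanx_fac = (bb.ymax - bb.ymin + 1) / static_cast<double>(chanx_tracks);
    const double chany_fac = (bb.xmax - bb.xmin + 1) / static_cast<double>(chany_tracks);
    return {chanx_fac, chany_fac};
}

Because the construction above bumps any channel that would add zero tracks up to one, the denominators here can never be zero.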
@@ -239,33 +208,32 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
const size_t grid_height = device_ctx.grid.height();
const size_t grid_width = device_ctx.grid.width();


acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0.);
acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0);

vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);

/*
* Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
* in the device. We count all these edges, regardless of which layers they connect. Then we divide by
* the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
* what we do for the horizontal and vertical channels where we assume the channel width doesn't change
* along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
* if someday we have architectures with widely varying connectivity between different layers in a stack.
*/

/*
 * To calculate the accumulated number of inter-die connections, we first need the number of
 * inter-die connections per location. So that this also works when the RR graph is read from a file
 * instead of being built from the architecture description, we compute this number by iterating over
 * the RR graph. Once tile_num_inter_die_conn is populated, we can populate acc_tile_num_inter_die_conn_.
 * First, we fill the first row and column. Then, for each remaining block, the accumulated count is the
 * number of inter-die connections at that location plus the accumulated counts of the blocks below it
 * and to its left. Since that sum counts the accumulated value of the block diagonally below and to the
 * left twice, it is subtracted once (a region-query sketch follows this hunk).
*/
for (const auto& src_rr_node : rr_graph.nodes()) {
for (const auto& rr_edge_idx : rr_graph.edges(src_rr_node)) {
const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
for (const RRNodeId src_rr_node : rr_graph.nodes()) {
for (const t_edge_size rr_edge_idx : rr_graph.edges(src_rr_node)) {
const RRNodeId sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
// We assume that the nodes driving the inter-layer connection or being driven by it
// are not stretched across multiple tiles
Expand All @@ -290,20 +258,20 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
// Initialize the first row and column
for (size_t x = 1; x < device_ctx.grid.width(); x++) {
acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] +
tile_num_inter_die_conn[x][0];
}

for (size_t y = 1; y < device_ctx.grid.height(); y++) {
acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] +
tile_num_inter_die_conn[0][y];
}

for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) {
for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) {
acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] +
acc_tile_num_inter_die_conn_[x_high][y_high-1] +
tile_num_inter_die_conn[x_high][y_high] -
acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
}
}
}
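acc_tile_num_inter_die_conn_ is an inclusive 2-D prefix sum, so the number of inter-die connections inside any placement bounding box can later be recovered with inclusion-exclusion. A hedged sketch of that query (the real get_chanz_cost_factor_(), shown further down, special-cases bb.xmin == 0 and bb.ymin == 0 rather than branching like this):

// Hypothetical helper illustrating the region query that the accumulated matrix enables.
int region_inter_die_conn_sketch(const vtr::NdMatrix<int, 2>& acc, const t_bb& bb) {
    int total = acc[bb.xmax][bb.ymax];                    // everything up to the top-right corner
    if (bb.xmin > 0) total -= acc[bb.xmin - 1][bb.ymax];  // remove the strip left of the box
    if (bb.ymin > 0) total -= acc[bb.xmax][bb.ymin - 1];  // remove the strip below the box
    if (bb.xmin > 0 && bb.ymin > 0)
        total += acc[bb.xmin - 1][bb.ymin - 1];           // re-add the doubly removed corner
    return total;
}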
@@ -1421,7 +1389,7 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {

const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];

double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
const double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());

/* Could insert a check for xmin == xmax. In that case, assume *
* connection will be made with no bends and hence no x-cost. *
Expand All @@ -1437,8 +1405,9 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
*/

double ncost;
ncost = (bb.xmax - bb.xmin + 1) * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
ncost += (bb.ymax - bb.ymin + 1) * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
const auto [chanx_cost_fac, chany_cost_fac] = get_chanxy_cost_fac_(bb);
ncost = (bb.xmax - bb.xmin + 1) * chanx_cost_fac;
ncost += (bb.ymax - bb.ymin + 1) * chany_cost_fac;
if (is_multi_layer_) {
ncost += (bb.layer_max - bb.layer_min) * get_chanz_cost_factor_(bb);
}
@@ -1448,6 +1417,7 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
return ncost;
}


double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use_ts) {
const auto& move_ctx = placer_state_.move();

@@ -1469,7 +1439,7 @@ double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use
/* Adjust the bounding box half perimeter by the wirelength correction
* factor based on terminal count, which is 1 for the source + the number
* of sinks on this layer. */
double crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1);
const double crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1);

/* Could insert a check for xmin == xmax. In that case, assume *
* connection will be made with no bends and hence no x-cost. *
@@ -1484,11 +1454,10 @@ double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use
* chan?_place_cost_fac_ objects can handle -1 indices internally.
*/

ncost += (bb[layer_num].xmax - bb[layer_num].xmin + 1) * crossing
* chanx_place_cost_fac_[bb[layer_num].ymax][bb[layer_num].ymin - 1];

ncost += (bb[layer_num].ymax - bb[layer_num].ymin + 1) * crossing
* chany_place_cost_fac_[bb[layer_num].xmax][bb[layer_num].xmin - 1];
const auto[chanx_cost_fac, chany_cost_fac] = get_chanxy_cost_fac_(bb[layer_num]);
ncost += (bb[layer_num].xmax - bb[layer_num].xmin + 1) * chanx_cost_fac;
ncost += (bb[layer_num].ymax - bb[layer_num].ymin + 1) * chany_cost_fac;
ncost *= crossing;
}

return ncost;
@@ -1546,8 +1515,6 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) {
}

float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
float place_cost_exp = placer_opts_.place_cost_exp;

int num_inter_dir_conn;

if (bb.xmin == 0 && bb.ymin == 0) {
@@ -1571,7 +1538,6 @@ float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
} else {
int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
z_cost_factor = bb_num_tiles / static_cast<float>(num_inter_dir_conn);
z_cost_factor = pow((double)z_cost_factor, (double)place_cost_exp);
}

return z_cost_factor;
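As a worked illustration (numbers assumed, not taken from the commit): a bounding box spanning 3 columns and 4 rows covers bb_num_tiles = 3 * 4 = 12 tiles; if 6 inter-die connections fall inside it, z_cost_factor = 12 / 6 = 2, so nets in regions with sparse vertical connectivity pay proportionally more for crossing layers. With place_cost_exp removed in this commit, that ratio is now used directly instead of being raised to a configurable power.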

