
Commit

Merge pull request verilog-to-routing#2799 from verilog-to-routing/temp_chan_w_factors_prefix_sum

Chan x/y placement cost factors using prefix sum
vaughnbetz authored Nov 11, 2024
2 parents 91f2941 + db3f7ae commit fdf6d3c
Showing 18 changed files with 125 additions and 140 deletions.
10 changes: 6 additions & 4 deletions libs/libvtrutil/src/vtr_ndoffsetmatrix.h
@@ -2,6 +2,7 @@
#define VTR_ND_OFFSET_MATRIX_H
#include <array>
#include <memory>
#include <algorithm>

#include "vtr_assert.h"

@@ -309,9 +310,8 @@ class NdOffsetMatrixBase {

///@brief Swap two NdOffsetMatrixBase objects
friend void swap(NdOffsetMatrixBase<T, N>& m1, NdOffsetMatrixBase<T, N>& m2) {
using std::swap;
swap(m1.dim_ranges_, m2.dim_ranges_);
swap(m1.data_, m2.data_);
std::swap(m1.dim_ranges_, m2.dim_ranges_);
std::swap(m1.data_, m2.data_);
}

private:
@@ -441,7 +441,9 @@ class NdOffsetMatrix<T, 1> : public NdOffsetMatrixBase<T, 1> {
VTR_ASSERT_SAFE_MSG(index >= this->dim_ranges_[0].begin_index(), "Index out of range (below dimension minimum)");
VTR_ASSERT_SAFE_MSG(index < this->dim_ranges_[0].end_index(), "Index out of range (above dimension maximum)");

return this->data_[index];
int effective_index = index - this->dim_ranges_[0].begin_index();

return this->data_[effective_index];
}

///@brief Access an element (mutable)
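The change above makes the 1-D specialization translate a logical index by the dimension's begin offset before touching the underlying storage. A minimal usage sketch of the fixed behaviour (the matrix shape and values here are assumptions, and the mutable accessor, elided in this hunk, is assumed to apply the same translation):

// A 1-D offset matrix whose valid logical indices are -1 .. 4, declared with the same
// constructor form as the accumulated channel-width arrays added later in this commit.
vtr::NdOffsetMatrix<int, 1> acc({{{-1, 5}}});
acc[-1] = 0;  // effective_index = -1 - (-1) = 0  -> data_[0]
acc[4] = 42;  // effective_index =  4 - (-1) = 5  -> data_[5], the last element
// Before this fix, the const accessor indexed data_ with the raw logical index, so any
// dimension with a non-zero begin offset returned the wrong element.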
2 changes: 0 additions & 2 deletions vpr/src/base/SetupVPR.cpp
@@ -633,8 +633,6 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts)
PlacerOpts->inner_loop_recompute_divider = Options.inner_loop_recompute_divider;
PlacerOpts->quench_recompute_divider = Options.quench_recompute_divider;

PlacerOpts->place_cost_exp = 1;

PlacerOpts->td_place_exp_first = Options.place_exp_first;

PlacerOpts->td_place_exp_last = Options.place_exp_last;
2 changes: 0 additions & 2 deletions vpr/src/base/ShowSetup.cpp
@@ -547,8 +547,6 @@ static void ShowPlacerOpts(const t_placer_opts& PlacerOpts,
VTR_LOG("Using constraints file '%s'\n", PlacerOpts.constraints_file.c_str());
}

VTR_LOG("PlacerOpts.place_cost_exp: %f\n", PlacerOpts.place_cost_exp);

VTR_LOG("PlacerOpts.place_chan_width: %d\n", PlacerOpts.place_chan_width);

if (PlacerOpts.place_algorithm.is_timing_driven()) {
1 change: 0 additions & 1 deletion vpr/src/base/vpr_types.h
@@ -1065,7 +1065,6 @@ struct t_placer_opts {
t_place_algorithm place_algorithm;
t_place_algorithm place_quench_algorithm;
float timing_tradeoff;
float place_cost_exp;
int place_chan_width;
enum e_pad_loc_type pad_loc_type;
std::string constraints_file;
172 changes: 69 additions & 103 deletions vpr/src/place/net_cost_handler.cpp
Expand Up @@ -151,79 +151,48 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
}

void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
const double place_cost_exp = static_cast<double>(placer_opts_.place_cost_exp);
auto& device_ctx = g_vpr_ctx.device();

const int grid_height = device_ctx.grid.height();
const int grid_width = device_ctx.grid.width();

/* Access arrays below as chan?_place_cost_fac_(subhigh, sublow). Since subhigh must be greater than or
* equal to sublow, we will only access the lower half of a matrix, but we allocate the whole matrix anyway
* for simplicity, so we can use the vtr utility matrix functions. */
chanx_place_cost_fac_ = vtr::NdOffsetMatrix<float, 2>({{{-1, grid_height}, {-1, grid_height}}});
chany_place_cost_fac_ = vtr::NdOffsetMatrix<float, 2>({{{-1, grid_width}, {-1, grid_width}}});

// First compute the number of tracks between channel high and channel low, inclusive.
chanx_place_cost_fac_[-1][-1] = 0;

for (int high = 0; high < grid_height; high++) {
chanx_place_cost_fac_[high][high] = (float)device_ctx.chan_width.x_list[high];
for (int low = -1; low < high; low++) {
chanx_place_cost_fac_[high][low] = chanx_place_cost_fac_[high - 1][low] + (float)device_ctx.chan_width.x_list[high];
}
}

/* Now compute the inverse of the average number of tracks per channel *
* between high and low. The cost function divides by the average *
* number of tracks per channel, so by storing the inverse I convert *
* this to a faster multiplication. Take this final number to the *
* place_cost_exp power -- numbers other than one mean this is no *
* longer a simple "average number of tracks"; it is some power of *
* that, allowing greater penalization of narrow channels. */
for (int high = -1; high < grid_height; high++) {
for (int low = -1; low <= high; low++) {
/* Since we will divide the wiring cost by the average channel *
* capacity between high and low, having only 0 width channels *
* will result in infinite wiring capacity normalization *
* factor, and extremely bad placer behaviour. Hence we change *
* this to a small (1 track) channel capacity instead. */
if (chanx_place_cost_fac_[high][low] == 0.0f) {
VTR_LOG_WARN("CHANX place cost fac is 0 at %d %d\n", high, low);
chanx_place_cost_fac_[high][low] = 1.0f;
}
const auto& device_ctx = g_vpr_ctx.device();

chanx_place_cost_fac_[high][low] = (high - low + 1.) / chanx_place_cost_fac_[high][low];
chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], place_cost_exp);
}
}
const int grid_height = (int)device_ctx.grid.height();
const int grid_width = (int)device_ctx.grid.width();

/* These arrays contain the accumulated channel width between channel zero and
 * the channel specified by the given index. The accumulation is inclusive,
 * meaning that it includes both channel zero and channel `idx`.
 * To compute the total channel width between channels 'low' and 'high', use:
 *       acc_chan?_width_[high] - acc_chan?_width_[low - 1]
 * This gives the total number of tracks between channels 'low' and 'high',
 * including the tracks in both of those channels (see the lookup sketch after this hunk).
 *
 * Channel -1 doesn't exist, so we treat it as having zero tracks. We need to be able
 * to access these arrays with index -1 to handle cases where the lower channel is 0.
 */
acc_chanx_width_ = vtr::NdOffsetMatrix<int, 1>({{{-1, grid_height}}});
acc_chany_width_ = vtr::NdOffsetMatrix<int, 1>({{{-1, grid_width}}});

/* Now do the same thing for the y-directed channels. First get the
* number of tracks between channel high and channel low, inclusive. */
chany_place_cost_fac_[-1][-1] = 0;
// initialize the first element (index -1) with zero
acc_chanx_width_[-1] = 0;
for (int y = 0; y < grid_height; y++) {
acc_chanx_width_[y] = acc_chanx_width_[y - 1] + device_ctx.chan_width.x_list[y];

for (int high = 0; high < grid_width; high++) {
chany_place_cost_fac_[high][high] = device_ctx.chan_width.y_list[high];
for (int low = -1; low < high; low++) {
chany_place_cost_fac_[high][low] = chany_place_cost_fac_[high - 1][low] + device_ctx.chan_width.y_list[high];
/* If the number of tracks in a channel is zero, two consecutive elements take the same
* value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
* potential issue, we assume that the channel width is at least 1.
*/
if (acc_chanx_width_[y] == acc_chanx_width_[y - 1]) {
acc_chanx_width_[y]++;
}
}

/* Now compute the inverse of the average number of tracks per channel
* between high and low. Take to specified power. */
for (int high = -1; high < grid_width; high++) {
for (int low = -1; low <= high; low++) {
/* Since we will divide the wiring cost by the average channel *
* capacity between high and low, having only 0 width channels *
* will result in infinite wiring capacity normalization *
* factor, and extremely bad placer behaviour. Hence we change *
* this to a small (1 track) channel capacity instead. */
if (chany_place_cost_fac_[high][low] == 0.0f) {
VTR_LOG_WARN("CHANY place cost fac is 0 at %d %d\n", high, low);
chany_place_cost_fac_[high][low] = 1.0f;
}
// initialize the first element (index -1) with zero
acc_chany_width_[-1] = 0;
for (int x = 0; x < grid_width; x++) {
acc_chany_width_[x] = acc_chany_width_[x - 1] + device_ctx.chan_width.y_list[x];

chany_place_cost_fac_[high][low] = (high - low + 1.) / chany_place_cost_fac_[high][low];
chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], place_cost_exp);
// to avoid a division by zero
if (acc_chany_width_[x] == acc_chany_width_[x - 1]) {
acc_chany_width_[x]++;
}
}

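With the accumulated widths above, the total track count over any span of channels is two lookups and a subtraction, and the inverse-average factor that the old triangular chan?_place_cost_fac_ matrices precomputed can instead be formed on demand. A minimal sketch of such a lookup, modelled on the get_chanxy_cost_fac_() helper referenced in the comments (its real signature and body are not shown in this diff, so the details below are assumptions):

// Hypothetical sketch: returns the inverse of the average number of tracks per channel
// spanned by a net's bounding box, so the cost function can multiply instead of divide.
std::pair<double, double> get_chanxy_cost_fac_sketch(const t_bb& bb) {
    // x-directed tracks lie in horizontal channels bb.ymin .. bb.ymax (inclusive).
    const int chanx_tracks = acc_chanx_width_[bb.ymax] - acc_chanx_width_[bb.ymin - 1];
    // y-directed tracks lie in vertical channels bb.xmin .. bb.xmax (inclusive).
    const int chany_tracks = acc_chany_width_[bb.xmax] - acc_chany_width_[bb.xmin - 1];

    const double chanx_fac = (bb.ymax - bb.ymin + 1) / static_cast<double>(chanx_tracks);
    const double chany_fac = (bb.xmax - bb.xmin + 1) / static_cast<double>(chany_tracks);
    return {chanx_fac, chany_fac};
}

Because the construction above bumps any channel that would add zero tracks up to one, the denominators here can never be zero.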
@@ -239,33 +208,32 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
const size_t grid_height = device_ctx.grid.height();
const size_t grid_width = device_ctx.grid.width();


acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0.);
acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0);

vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);

/*
* Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
* in the device. We count all these edges, regardless of which layers they connect. Then we divide by
* the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
* what we do for the horizontal and vertical channels where we assume the channel width doesn't change
* along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
* if someday we have architectures with widely varying connectivity between different layers in a stack.
*/

/*
 * To calculate the accumulated number of inter-die connections, we first need the number of
 * inter-die connections per location. So that this also works when the RR graph is read from a file
 * instead of being built from the architecture description, we compute this number by iterating over
 * the RR graph. Once tile_num_inter_die_conn is populated, we can populate acc_tile_num_inter_die_conn_.
 * First, we fill the first row and column. Then, for each remaining block, the accumulated count is the
 * number of inter-die connections at that location plus the accumulated counts of the blocks below it
 * and to its left. Since that sum counts the accumulated value of the block diagonally below and to the
 * left twice, it is subtracted once (a region-query sketch follows this hunk).
*/
for (const auto& src_rr_node : rr_graph.nodes()) {
for (const auto& rr_edge_idx : rr_graph.edges(src_rr_node)) {
const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
for (const RRNodeId src_rr_node : rr_graph.nodes()) {
for (const t_edge_size rr_edge_idx : rr_graph.edges(src_rr_node)) {
const RRNodeId sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
// We assume that the nodes driving the inter-layer connection or being driven by it
// are not stretched across multiple tiles
Expand All @@ -290,20 +258,20 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
// Initialize the first row and column
for (size_t x = 1; x < device_ctx.grid.width(); x++) {
acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] +
tile_num_inter_die_conn[x][0];
}

for (size_t y = 1; y < device_ctx.grid.height(); y++) {
acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] +
tile_num_inter_die_conn[0][y];
}

for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) {
for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) {
acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] +
acc_tile_num_inter_die_conn_[x_high][y_high-1] +
tile_num_inter_die_conn[x_high][y_high] -
acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
}
}
}
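acc_tile_num_inter_die_conn_ is an inclusive 2-D prefix sum, so the number of inter-die connections inside any placement bounding box can later be recovered with inclusion-exclusion. A hedged sketch of that query (the real get_chanz_cost_factor_(), shown further down, special-cases bb.xmin == 0 and bb.ymin == 0 rather than branching like this):

// Hypothetical helper illustrating the region query that the accumulated matrix enables.
int region_inter_die_conn_sketch(const vtr::NdMatrix<int, 2>& acc, const t_bb& bb) {
    int total = acc[bb.xmax][bb.ymax];                    // everything up to the top-right corner
    if (bb.xmin > 0) total -= acc[bb.xmin - 1][bb.ymax];  // remove the strip left of the box
    if (bb.ymin > 0) total -= acc[bb.xmax][bb.ymin - 1];  // remove the strip below the box
    if (bb.xmin > 0 && bb.ymin > 0)
        total += acc[bb.xmin - 1][bb.ymin - 1];           // re-add the doubly removed corner
    return total;
}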
@@ -1421,7 +1389,7 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {

const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];

double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
const double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());

/* Could insert a check for xmin == xmax. In that case, assume *
* connection will be made with no bends and hence no x-cost. *
Expand All @@ -1437,8 +1405,9 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
*/

double ncost;
ncost = (bb.xmax - bb.xmin + 1) * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
ncost += (bb.ymax - bb.ymin + 1) * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
const auto [chanx_cost_fac, chany_cost_fac] = get_chanxy_cost_fac_(bb);
ncost = (bb.xmax - bb.xmin + 1) * chanx_cost_fac;
ncost += (bb.ymax - bb.ymin + 1) * chany_cost_fac;
if (is_multi_layer_) {
ncost += (bb.layer_max - bb.layer_min) * get_chanz_cost_factor_(bb);
}
@@ -1448,6 +1417,7 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
return ncost;
}


double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use_ts) {
const auto& move_ctx = placer_state_.move();

@@ -1469,7 +1439,7 @@ double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use
/* Adjust the bounding box half perimeter by the wirelength correction
* factor based on terminal count, which is 1 for the source + the number
* of sinks on this layer. */
double crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1);
const double crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1);

/* Could insert a check for xmin == xmax. In that case, assume *
* connection will be made with no bends and hence no x-cost. *
@@ -1484,11 +1454,10 @@ double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use
* chan?_place_cost_fac_ objects can handle -1 indices internally.
*/

ncost += (bb[layer_num].xmax - bb[layer_num].xmin + 1) * crossing
* chanx_place_cost_fac_[bb[layer_num].ymax][bb[layer_num].ymin - 1];

ncost += (bb[layer_num].ymax - bb[layer_num].ymin + 1) * crossing
* chany_place_cost_fac_[bb[layer_num].xmax][bb[layer_num].xmin - 1];
const auto[chanx_cost_fac, chany_cost_fac] = get_chanxy_cost_fac_(bb[layer_num]);
ncost += (bb[layer_num].xmax - bb[layer_num].xmin + 1) * chanx_cost_fac;
ncost += (bb[layer_num].ymax - bb[layer_num].ymin + 1) * chany_cost_fac;
ncost *= crossing;
}

return ncost;
@@ -1546,8 +1515,6 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) {
}

float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
float place_cost_exp = placer_opts_.place_cost_exp;

int num_inter_dir_conn;

if (bb.xmin == 0 && bb.ymin == 0) {
@@ -1571,7 +1538,6 @@ float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
} else {
int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
z_cost_factor = bb_num_tiles / static_cast<float>(num_inter_dir_conn);
z_cost_factor = pow((double)z_cost_factor, (double)place_cost_exp);
}

return z_cost_factor;
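As a worked illustration (numbers assumed, not taken from the commit): a bounding box spanning 3 columns and 4 rows covers bb_num_tiles = 3 * 4 = 12 tiles; if 6 inter-die connections fall inside it, z_cost_factor = 12 / 6 = 2, so nets in regions with sparse vertical connectivity pay proportionally more for crossing layers. With place_cost_exp removed in this commit, that ratio is now used directly instead of being raised to a configurable power.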

