Skip to content

Commit

Permalink
[fix](statistics)Fix column cached stats size bug. (apache#37545)
Browse files Browse the repository at this point in the history
Fix a bug in calculating the average size for column cached stats. The
average size is a double and is computed as totalSize / rowCount. Unless
totalSize is first cast to double, the division keeps only the integer
part, so the result is inaccurate. This change casts totalSize to double
before dividing.
  • Loading branch information
Jibing-Li committed Jul 10, 2024
1 parent 012f26a commit dff9f49
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ public ColumnStatistic toColumnStatistic() {
columnStatisticBuilder.setNdv(ndv);
columnStatisticBuilder.setNumNulls(nullCount);
columnStatisticBuilder.setDataSize(dataSizeInBytes);
columnStatisticBuilder.setAvgSizeByte(count == 0 ? 0 : dataSizeInBytes / count);
columnStatisticBuilder.setAvgSizeByte(count == 0 ? 0 : ((double) dataSizeInBytes) / count);
if (statsId == null) {
return ColumnStatistic.UNKNOWN;
}
Expand Down
73 changes: 62 additions & 11 deletions regression-test/suites/statistics/analyze_stats.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ suite("test_analyze") {
"""

def contains_expected_table = { r ->
for (int i = 0; i < r.size; i++) {
for (int i = 0; i < r.size(); i++) {
if (r[i][3] == "${tbl}") {
return true
}
Expand All @@ -190,7 +190,7 @@ suite("test_analyze") {
}

def stats_job_removed = { r, id ->
for (int i = 0; i < r.size; i++) {
for (int i = 0; i < r.size(); i++) {
if (r[i][0] == id) {
return false
}
Expand Down Expand Up @@ -250,7 +250,7 @@ suite("test_analyze") {
"""

def expected_result = { r->
for(int i = 0; i < r.size; i++) {
for(int i = 0; i < r.size(); i++) {
if ((int) Double.parseDouble(r[i][2]) == 6) {
return true
} else {
Expand Down Expand Up @@ -1150,9 +1150,9 @@ PARTITION `p599` VALUES IN (599)
sql """ INSERT INTO test_updated_rows VALUES('1',1,1); """
def cnt1 = sql """ SHOW TABLE STATS test_updated_rows """
for (int i = 0; i < 10; ++i) {
if (Integer.valueOf(cnt1[0][0]) == 8) break;
Thread.sleep(1000) // rows updated report is async
cnt1 = sql """ SHOW TABLE STATS test_updated_rows """
if (Integer.valueOf(cnt1[0][0]) == 8) break;
Thread.sleep(1000) // rows updated report is async
cnt1 = sql """ SHOW TABLE STATS test_updated_rows """
}
assertEquals(Integer.valueOf(cnt1[0][0]), 1)
sql """ANALYZE TABLE test_updated_rows WITH SYNC"""
Expand All @@ -1162,9 +1162,9 @@ PARTITION `p599` VALUES IN (599)
sql """ANALYZE TABLE test_updated_rows WITH SYNC"""
def cnt2 = sql """ SHOW TABLE STATS test_updated_rows """
for (int i = 0; i < 10; ++i) {
if (Integer.valueOf(cnt2[0][0]) == 8) break;
Thread.sleep(1000) // rows updated report is async
cnt2 = sql """ SHOW TABLE STATS test_updated_rows """
if (Integer.valueOf(cnt2[0][0]) == 8) break;
Thread.sleep(1000) // rows updated report is async
cnt2 = sql """ SHOW TABLE STATS test_updated_rows """
}
assertTrue(Integer.valueOf(cnt2[0][0]) == 0 || Integer.valueOf(cnt2[0][0]) == 8)

Expand Down Expand Up @@ -1214,7 +1214,7 @@ PARTITION `p599` VALUES IN (599)
"""

def tbl_name_as_expetected = { r,name ->
for (int i = 0; i < r.size; i++) {
for (int i = 0; i < r.size(); i++) {
if (r[i][3] != name) {
return false
}
Expand All @@ -1232,7 +1232,7 @@ PARTITION `p599` VALUES IN (599)
assert show_result.size() > 0

def all_finished = { r ->
for (int i = 0; i < r.size; i++) {
for (int i = 0; i < r.size(); i++) {
if (r[i][9] != "FINISHED") {
return false
}
Expand Down Expand Up @@ -2810,6 +2810,57 @@ PARTITION `p599` VALUES IN (599)
result_sample = sql """show analyze task status ${jobId}"""
assertEquals(2, result_sample.size())

// Test inject stats avg_size.
sql """CREATE TABLE `date_dim` (
`d_date_sk` BIGINT NOT NULL,
`d_date_id` CHAR(16) NOT NULL,
`d_date` DATE NULL,
`d_month_seq` INT NULL,
`d_week_seq` INT NULL,
`d_quarter_seq` INT NULL,
`d_year` INT NULL,
`d_dow` INT NULL,
`d_moy` INT NULL,
`d_dom` INT NULL,
`d_qoy` INT NULL,
`d_fy_year` INT NULL,
`d_fy_quarter_seq` INT NULL,
`d_fy_week_seq` INT NULL,
`d_day_name` CHAR(9) NULL,
`d_quarter_name` CHAR(6) NULL,
`d_holiday` CHAR(1) NULL,
`d_weekend` CHAR(1) NULL,
`d_following_holiday` CHAR(1) NULL,
`d_first_dom` INT NULL,
`d_last_dom` INT NULL,
`d_same_day_ly` INT NULL,
`d_same_day_lq` INT NULL,
`d_current_day` CHAR(1) NULL,
`d_current_week` CHAR(1) NULL,
`d_current_month` CHAR(1) NULL,
`d_current_quarter` CHAR(1) NULL,
`d_current_year` CHAR(1) NULL
) ENGINE=OLAP
DUPLICATE KEY(`d_date_sk`)
DISTRIBUTED BY HASH(`d_date_sk`) BUCKETS 12
PROPERTIES (
"replication_allocation" = "tag.location.default: 1")
"""

sql """
alter table date_dim modify column d_day_name set stats ('row_count'='73049', 'ndv'='7', 'num_nulls'='0', 'min_value'='Friday', 'max_value'='Wednesday', 'data_size'='521779')
"""

alter_result = sql """show column cached stats date_dim"""
assertEquals("d_day_name", alter_result[0][0])
assertEquals("date_dim", alter_result[0][1])
assertEquals("73049.0", alter_result[0][2])
assertEquals("7.0", alter_result[0][3])
assertEquals("0.0", alter_result[0][4])
assertEquals("521779.0", alter_result[0][5])
assertEquals("7.142863009760572", alter_result[0][6])


sql """DROP DATABASE IF EXISTS trigger"""
}

0 comments on commit dff9f49

Please sign in to comment.