Luis Fernandez
06/01/2022, 2:16 PMSELECT product_id, SUM(impression_count) as impression_count, SUM(click_count) as click_count, SUM(cost) as spent_total FROM metrics
WHERE user_id = xx AND serve_time BETWEEN 1641013200 AND 1654092017
GROUP BY product_id
LIMIT 100000
this is an example of a query we are running
"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 1317,
"numSegmentsProcessed": 168,
"numSegmentsMatched": 117,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 69212,
"numEntriesScannedInFilter": 1165155303,
"numEntriesScannedPostFilter": 415272,
"numGroupsLimitReached": false,
"totalDocs": 10362679599,
"timeUsedMs": 4623,
this is some of the stats that come back. our data resolution is hourly for this data. Do you all have any idea how to make a query like this perform better?
We have an idea of making the records not have hourly resolution after certain period of time but have daily resolution so that the data is compressed even further but wanted to ask if there are any methods we could use for this and if the method we can implement makes sense to you all.Diogo Baeder
06/01/2022, 2:26 PMserve_time
is either the time column or a column determined as a partition for the segments - so that you can prune segments to avoid searching too many of them -, and maybe creating an inverted index on user_id
. I don't know, however, if segment partitioning works with realtime ingestion (which I'm guessing is your case).Luis Fernandez
06/01/2022, 2:29 PMserve_time
is the timecolumn in our config "timeColumnName": "serve_time",
in this case we are querying an offline tableDiogo Baeder
06/01/2022, 2:30 PMKishore G
"numEntriesScannedInFilter": 1165155303,
Kishore G
Kishore G
Luis Fernandez
06/01/2022, 2:33 PM"segmentPartitionConfig": {
"columnPartitionMap": {
"user_id": {
"functionName": "Murmur",
"numPartitions": 8
}
}
},
"sortedColumn": [
"user_id"
],
"bloomFilterColumns": [
"user_id",
"product_id"
],
Luis Fernandez
06/01/2022, 2:33 PM"rangeIndexColumns": [],
"rangeIndexVersion": 2,
Luis Fernandez
06/01/2022, 2:35 PMDiogo Baeder
06/01/2022, 2:36 PMMayank
Luis Fernandez
06/01/2022, 2:44 PMLuis Fernandez
06/01/2022, 2:47 PMKishore G
Luis Fernandez
06/01/2022, 2:49 PMLuis Fernandez
06/01/2022, 2:50 PMLuis Fernandez
06/01/2022, 2:50 PMKishore G
Mayank
Mayank
Luis Fernandez
06/01/2022, 2:52 PMMayank
Mayank
Luis Fernandez
06/01/2022, 2:54 PMLuis Fernandez
06/01/2022, 2:54 PM{
"OFFLINE": {
"tableName": "metrics_OFFLINE",
"tableType": "OFFLINE",
"segmentsConfig": {
"schemaName": "metrics",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": "730",
"replication": "2",
"timeColumnName": "serve_time",
"allowNullTimeValue": false,
"segmentPushType": "APPEND"
},
"tenants": {
"broker": "DefaultTenant",
"server": "DefaultTenant"
},
"tableIndexConfig": {
"invertedIndexColumns": [],
"noDictionaryColumns": [
"click_count",
"order_count",
"impression_count",
"cost",
"revenue"
],
"rangeIndexColumns": [],
"rangeIndexVersion": 2,
"autoGeneratedInvertedIndex": false,
"createInvertedIndexDuringSegmentGeneration": false,
"sortedColumn": [
"user_id"
],
"bloomFilterColumns": [
"user_id",
"product_id"
],
"loadMode": "MMAP",
"onHeapDictionaryColumns": [],
"varLengthDictionaryColumns": [],
"enableDefaultStarTree": false,
"enableDynamicStarTreeCreation": false,
"segmentPartitionConfig": {
"columnPartitionMap": {
"user_id": {
"functionName": "Murmur",
"numPartitions": 16
}
}
},
"aggregateMetrics": false,
"nullHandlingEnabled": false
},
"metadata": {},
"quota": {},
"routing": {
"segmentPrunerTypes": [
"partition"
]
},
"query": {},
"fieldConfigList": [],
"ingestionConfig": {},
"isDimTable": false
}
}
Luis Fernandez
06/01/2022, 2:54 PMLuis Fernandez
06/01/2022, 2:56 PMDiogo Baeder
06/01/2022, 3:01 PMDiogo Baeder
06/01/2022, 3:02 PMLuis Fernandez
06/01/2022, 3:38 PMLuis Fernandez
06/01/2022, 3:38 PM"exceptions": [],
"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 1423,
"numSegmentsProcessed": 168,
"numSegmentsMatched": 117,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 69212,
"numEntriesScannedInFilter": 1165155303,
"numEntriesScannedPostFilter": 415272,
"numGroupsLimitReached": false,
"totalDocs": 11234231893,
"timeUsedMs": 1464,
"offlineThreadCpuTimeNs": 0,
"realtimeThreadCpuTimeNs": 0,
"offlineSystemActivitiesCpuTimeNs": 0,
"realtimeSystemActivitiesCpuTimeNs": 0,
"offlineResponseSerializationCpuTimeNs": 0,
"realtimeResponseSerializationCpuTimeNs": 0,
"offlineTotalCpuTimeNs": 0,
"realtimeTotalCpuTimeNs": 0,
"segmentStatistics": [],
"traceInfo": {},
"minConsumingFreshnessTimeMs": 0,
"numRowsResultSet": 1
Luis Fernandez
06/01/2022, 3:38 PMnumEntriesScannedInFilter
stillLuis Fernandez
06/01/2022, 4:15 PMLuis Fernandez
06/01/2022, 4:39 PM"rangeIndexColumns": [
"serve_time"
],
"rangeIndexVersion": 2,
added thisMayank
Luis Fernandez
06/01/2022, 4:41 PMMayank
Luis Fernandez
06/01/2022, 4:42 PMLuis Fernandez
06/01/2022, 4:42 PMMayank
WHERE user_id = xx AND serve_time BETWEEN 1641013200 AND 1654092017
Mayank
Mayank
1165155303
docsMayank
Luis Fernandez
06/01/2022, 4:45 PM"exceptions": [],
"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 1497,
"numSegmentsProcessed": 168,
"numSegmentsMatched": 117,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 69212,
"numEntriesScannedInFilter": 1165155303,
"numEntriesScannedPostFilter": 415272,
"numGroupsLimitReached": false,
"totalDocs": 11854535291,
"timeUsedMs": 1755,
"offlineThreadCpuTimeNs": 0,
"realtimeThreadCpuTimeNs": 0,
"offlineSystemActivitiesCpuTimeNs": 0,
"realtimeSystemActivitiesCpuTimeNs": 0,
"offlineResponseSerializationCpuTimeNs": 0,
"realtimeResponseSerializationCpuTimeNs": 0,
"offlineTotalCpuTimeNs": 0,
"realtimeTotalCpuTimeNs": 0,
"segmentStatistics": [],
"traceInfo": {},
"minConsumingFreshnessTimeMs": 0,
"numRowsResultSet": 25305
Mayank
Luis Fernandez
06/01/2022, 4:45 PM"numEntriesScannedInFilter": 1165155303,
"numEntriesScannedPostFilter": 415272,
Mayank
select count(*) where userId =xxx
? If numEntiresScanedInFilter is not zero then data is not sortedLuis Fernandez
06/01/2022, 4:48 PM"exceptions": [],
"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 1497,
"numSegmentsProcessed": 204,
"numSegmentsMatched": 143,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 83660,
"numEntriesScannedInFilter": 1435776018,
"numEntriesScannedPostFilter": 0,
"numGroupsLimitReached": false,
"totalDocs": 11854535291,
"timeUsedMs": 4000,
"offlineThreadCpuTimeNs": 0,
"realtimeThreadCpuTimeNs": 0,
"offlineSystemActivitiesCpuTimeNs": 0,
"realtimeSystemActivitiesCpuTimeNs": 0,
"offlineResponseSerializationCpuTimeNs": 0,
"realtimeResponseSerializationCpuTimeNs": 0,
"offlineTotalCpuTimeNs": 0,
"realtimeTotalCpuTimeNs": 0,
"segmentStatistics": [],
"traceInfo": {},
"minConsumingFreshnessTimeMs": 0,
"numRowsResultSet": 1
Luis Fernandez
06/01/2022, 4:48 PMLuis Fernandez
06/01/2022, 4:49 PM83660
the count(*)
is for that userLuis Fernandez
06/01/2022, 4:51 PM"sortedColumn": [
"user_id"
],
Luis Fernandez
06/01/2022, 4:51 PMMayank
Luis Fernandez
06/01/2022, 5:05 PMLuis Fernandez
06/01/2022, 5:05 PMMayank
Luis Fernandez
06/01/2022, 5:07 PMMayank
Luis Fernandez
06/01/2022, 5:30 PMLuis Fernandez
06/01/2022, 5:31 PM/segments/{tableName}/{segmentName}/metadata
?Luis Fernandez
06/01/2022, 5:59 PMLuis Fernandez
06/01/2022, 5:59 PMuser_id
Mayank
Luis Fernandez
06/01/2022, 7:40 PMLuis Fernandez
06/01/2022, 7:40 PMLuis Fernandez
06/01/2022, 7:40 PMLuis Fernandez
06/01/2022, 7:41 PM{
"tableIndexConfig": {
"sortedColumn": [
"column_name"
],
...
}
}
Mayank
Luis Fernandez
06/01/2022, 7:42 PMMayank
Luis Fernandez
06/01/2022, 7:42 PMLuis Fernandez
06/01/2022, 7:42 PMuser_id
Luis Fernandez
06/01/2022, 7:43 PMMayank
Mayank
Luis Fernandez
06/01/2022, 7:44 PMLuis Fernandez
06/01/2022, 7:44 PM"sortedColumn": [
"user_id"
],
"bloomFilterColumns": [
"user_id",
"product_id"
],
Mayank
Mayank
Luis Fernandez
06/01/2022, 7:44 PMLuis Fernandez
06/01/2022, 7:45 PMMayank
Mayank
Luis Fernandez
06/01/2022, 7:47 PMLuis Fernandez
06/01/2022, 7:49 PMuser_id
Luis Fernandez
06/01/2022, 7:55 PMLuis Fernandez
06/01/2022, 7:55 PMuser_id
Luis Fernandez
06/01/2022, 7:55 PMMayank
Mayank
Mayank
Luis Fernandez
06/02/2022, 2:59 AMLuis Fernandez
06/02/2022, 2:59 AM############ Column(user_id) ############
name: user_id
path: user_id
max_definition_level: 1
max_repetition_level: 0
physical_type: INT64
logical_type: None
converted_type (legacy): NONE
Luis Fernandez
06/02/2022, 3:00 AMLuis Fernandez
06/02/2022, 3:00 AMMayank
Luis Fernandez
06/02/2022, 2:37 PMLuis Fernandez
06/02/2022, 2:37 PM| 1.90185e+07 |
| 1.90211e+07 |
Luis Fernandez
06/02/2022, 2:38 PMLuis Fernandez
06/02/2022, 2:38 PMLuis Fernandez
06/02/2022, 2:40 PMLuis Fernandez
06/02/2022, 2:40 PM.orderBy(imp_click_receipts_dataframe_final("user_id"))
Luis Fernandez
06/02/2022, 2:41 PMMayank
Mayank
Luis Fernandez
06/03/2022, 6:10 PMMayank
Rebecca Lau
06/03/2022, 8:13 PMLuis Fernandez
06/06/2022, 1:14 PMLuis Fernandez
06/06/2022, 1:14 PM"exceptions": [],
"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 11445,
"numSegmentsProcessed": 213,
"numSegmentsMatched": 116,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 68683,
"numEntriesScannedInFilter": 0,
"numEntriesScannedPostFilter": 412098,
"numGroupsLimitReached": false,
"totalDocs": 50128023764,
"timeUsedMs": 581,
"offlineThreadCpuTimeNs": 0,
"realtimeThreadCpuTimeNs": 0,
"offlineSystemActivitiesCpuTimeNs": 0,
"realtimeSystemActivitiesCpuTimeNs": 0,
"offlineResponseSerializationCpuTimeNs": 0,
"realtimeResponseSerializationCpuTimeNs": 0,
"offlineTotalCpuTimeNs": 0,
"realtimeTotalCpuTimeNs": 0,
"segmentStatistics": [],
"traceInfo": {},
"minConsumingFreshnessTimeMs": 0,
"numRowsResultSet": 25228
Luis Fernandez
06/06/2022, 1:14 PMLuis Fernandez
06/06/2022, 1:16 PMLuis Fernandez
06/06/2022, 1:39 PMMayank
Luis Fernandez
06/06/2022, 2:21 PMSELECT product_id, SUM(impression_count) as impression_count, SUM(click_count) as click_count, SUM(cost) as spent_total FROM metrics
WHERE user_id = xx AND serve_time BETWEEN 1641013200 AND 1654092017
GROUP BY product_id
LIMIT 100000
Luis Fernandez
06/06/2022, 2:21 PMLuis Fernandez
06/06/2022, 2:21 PMproduct_id
Luis Fernandez
06/06/2022, 2:22 PMMayank
Luis Fernandez
06/06/2022, 2:39 PMLuis Fernandez
06/06/2022, 2:39 PMLuis Fernandez
06/06/2022, 2:43 PMMayank
Luis Fernandez
06/06/2022, 3:27 PMLuis Fernandez
06/06/2022, 3:27 PMLuis Fernandez
06/06/2022, 3:29 PMMayank
Select count(*) from metrics WHERE user_id = xx AND serve_time BETWEEN 1641013200 AND 1654092017
and also the specs of the node on which you are running this?Luis Fernandez
06/06/2022, 4:58 PM"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 11445,
"numSegmentsProcessed": 213,
"numSegmentsMatched": 116,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 68683,
"numEntriesScannedInFilter": 0,
"numEntriesScannedPostFilter": 0,
"numGroupsLimitReached": false,
"totalDocs": 50128023764,
"timeUsedMs": 33,
"offlineThreadCpuTimeNs": 0,
"realtimeThreadCpuTimeNs": 0,
"offlineSystemActivitiesCpuTimeNs": 0,
"realtimeSystemActivitiesCpuTimeNs": 0,
"offlineResponseSerializationCpuTimeNs": 0,
"realtimeResponseSerializationCpuTimeNs": 0,
"offlineTotalCpuTimeNs": 0,
"realtimeTotalCpuTimeNs": 0,
"segmentStatistics": [],
"traceInfo": {},
"minConsumingFreshnessTimeMs": 0,
"numRowsResultSet": 1
Luis Fernandez
06/06/2022, 5:00 PMMayank
Luis Fernandez
06/07/2022, 1:36 AMMayank
Luis Fernandez
06/07/2022, 1:19 PMMayank
Mayank
Luis Fernandez
06/07/2022, 1:21 PMMayank
Luis Fernandez
06/07/2022, 1:25 PMLuis Fernandez
06/07/2022, 1:25 PMMayank
where user_id=xxx
, or increase segment size.Luis Fernandez
06/07/2022, 1:37 PMLuis Fernandez
06/07/2022, 1:37 PMLuis Fernandez
06/07/2022, 1:38 PMLuis Fernandez
06/07/2022, 1:38 PMSELECT product_id, SUM(impression_count) as impression_count, SUM(click_count) as click_count, SUM(cost) as spent_total FROM metrics
WHERE user_id = xx AND serve_time BETWEEN 1641013200 AND 1654092017
GROUP BY product_id
LIMIT 100000
Luis Fernandez
06/07/2022, 1:44 PMLuis Fernandez
06/07/2022, 1:46 PMLuis Fernandez
06/07/2022, 1:48 PM"numServersQueried": 2,
"numServersResponded": 2,
"numSegmentsQueried": 4608,
"numSegmentsProcessed": 227,
"numSegmentsMatched": 126,
"numConsumingSegmentsQueried": 0,
"numDocsScanned": 239295,
"numEntriesScannedInFilter": 0,
"numEntriesScannedPostFilter": 1435770,
"numGroupsLimitReached": false,
"totalDocs": 22271206985,
"timeUsedMs": 441,
"offlineThreadCpuTimeNs": 0,
"realtimeThreadCpuTimeNs": 0,
"offlineSystemActivitiesCpuTimeNs": 0,
"realtimeSystemActivitiesCpuTimeNs": 0,
"offlineResponseSerializationCpuTimeNs": 0,
"realtimeResponseSerializationCpuTimeNs": 0,
"offlineTotalCpuTimeNs": 0,
"realtimeTotalCpuTimeNs": 0,
"segmentStatistics": [],
"traceInfo": {},
"minConsumingFreshnessTimeMs": 0,
"numRowsResultSet": 682
Luis Fernandez
06/07/2022, 1:49 PMLuis Fernandez
06/07/2022, 1:49 PMMayank
Luis Fernandez
06/07/2022, 1:56 PMMayank
BETWEEN 1641013200 AND 1654092017
Luis Fernandez
06/07/2022, 2:28 PMLuis Fernandez
06/07/2022, 2:28 PMMayank
Luis Fernandez
06/07/2022, 2:30 PMLuis Fernandez
06/07/2022, 2:30 PMMayank
Luis Fernandez
06/08/2022, 6:55 PM