Abhijeet Kushe
07/21/2021, 2:17 PMMayank
Neha Pawar
Abhijeet Kushe
07/21/2021, 2:47 PMRyan Clark
07/21/2021, 3:01 PMNeha Pawar
Abhijeet Kushe
07/21/2021, 3:20 PM{
"tableName": "transcript",
"tableType": "REALTIME",
"segmentsConfig": {
"timeColumnName": "timestampInEpoch",
"timeType": "MILLISECONDS",
"schemaName": "transcript",
"replicasPerPartition": "1"
},
"tenants": {},
"tableIndexConfig": {
"loadMode": "MMAP",
"streamConfigs": {
"streamType": "kinesis",
"stream.kinesis.topic.name": "cdp_metric_events_poc",
"region": "us-east-1",
"endpoint": "https://kinesis.us-east-1.amazonaws.com",
"shardIteratorType": "AFTER_SEQUENCE_NUMBER",
"stream.kinesis.consumer.type": "lowlevel",
"stream.kinesis.fetch.timeout.millis": "30000",
"stream.kinesis.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
"stream.kinesis.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kinesis.KinesisConsumerFactory",
"realtime.segment.flush.threshold.size": "1000000",
"realtime.segment.flush.threshold.time": "1s"
}
},
"metadata": {
"customConfigs": {}
}
}
Abhijeet Kushe
07/21/2021, 3:22 PMAbhijeet Kushe
07/21/2021, 3:43 PMAbhijeet Kushe
07/21/2021, 3:44 PMMayank
Abhijeet Kushe
07/21/2021, 3:46 PMNeha Pawar
Abhijeet Kushe
07/21/2021, 5:06 PMAbhijeet Kushe
07/21/2021, 5:08 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
07/21/2021, 5:26 PMNeha Pawar
Abhijeet Kushe
07/21/2021, 5:31 PMAbhijeet Kushe
07/27/2021, 1:11 AM{
"schemaName": "transcript",
"dimensionFieldSpecs": [
{
"name": "studentID",
"dataType": "INT"
},
{
"name": "firstName",
"dataType": "STRING"
},
{
"name": "lastName",
"dataType": "STRING"
},
{
"name": "gender",
"dataType": "STRING"
},
{
"name": "subject",
"dataType": "STRING"
}
],
"primaryKeyColumns": [
"studentID",
"subject"
],
"metricFieldSpecs": [
{
"name": "score",
"dataType": "FLOAT"
}
],
"dateTimeFieldSpecs": [
{
"name": "timestamp",
"dataType": "LONG",
"format": "1:MILLISECONDS:EPOCH",
"granularity": "1:MILLISECONDS"
}
]
}
Table Config
{
"tableName": "transcript",
"tableType": "REALTIME",
"segmentsConfig": {
"timeColumnName": "timestamp",
"timeType": "MILLISECONDS",
"schemaName": "transcript",
"replicasPerPartition": "1"
},
"tenants": {},
"tableIndexConfig": {
"loadMode": "MMAP",
"streamConfigs": {
"streamType": "kinesis",
"stream.kinesis.topic.name": "cdp_metric_events_poc",
"region": "us-east-1",
"shardIteratorType": "AFTER_SEQUENCE_NUMBER",
"stream.kinesis.consumer.type": "lowlevel",
"stream.kinesis.fetch.timeout.millis": "30000",
"stream.kinesis.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
"stream.kinesis.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kinesis.KinesisConsumerFactory",
"realtime.segment.flush.threshold.size": "1000000",
"realtime.segment.flush.threshold.time": "21600" # Also tried 6h did not work
}
},
"upsertConfig": {
"mode": "FULL"
},
"routing": {
"instanceSelectorType": "strictReplicaGroup"
},
"metadata": {
"customConfigs": {}
}
}
Abhijeet Kushe
07/27/2021, 12:26 PMNeha Pawar
Abhijeet Kushe
07/27/2021, 2:22 PMAbhijeet Kushe
07/27/2021, 2:22 PMAbhijeet Kushe
07/27/2021, 2:34 PM"segmentsConfig": {
"timeColumnName": "mtime",
"timeType": "MILLISECONDS",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": "1",
"segmentPushType": "APPEND",
"segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy",
"schemaName": "transcript",
"replicasPerPartition": "1"
}
Neha Pawar
Neha Pawar
Abhijeet Kushe
07/27/2021, 3:40 PMNeha Pawar
Neha Pawar
Neha Pawar
Abhijeet Kushe
07/27/2021, 7:31 PMMayank
Mayank
Abhijeet Kushe
07/27/2021, 7:45 PMAbhijeet Kushe
08/02/2021, 1:44 PMAbhijeet Kushe
08/03/2021, 1:09 PMMayank
Abhijeet Kushe
08/03/2021, 4:29 PMNeha Pawar
Abhijeet Kushe
08/05/2021, 5:05 PMAbhijeet Kushe
08/06/2021, 5:55 PMNeha Pawar
Neha Pawar
Neha Pawar
Abhijeet Kushe
08/06/2021, 5:59 PMNeha Pawar
Abhijeet Kushe
08/06/2021, 6:02 PM{
"tableName": "transcript",
"tableType": "REALTIME",
"segmentsConfig": {
"timeColumnName": "timestamp",
"timeType": "MILLISECONDS",
"schemaName": "transcript",
"replicasPerPartition": "1",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": "2",
"segmentPushType": "APPEND"
},
"tenants": {},
"tableIndexConfig": {
"loadMode": "MMAP",
"streamConfigs": {
"streamType": "kinesis",
"stream.kinesis.topic.name": "cdp_metric_events_poc",
"region": "us-east-1",
"shardIteratorType": "LATEST",
"stream.kinesis.consumer.type": "lowlevel",
"stream.kinesis.fetch.timeout.millis": "30000",
"stream.kinesis.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
"stream.kinesis.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kinesis.KinesisConsumerFactory",
"realtime.segment.flush.threshold.size": "1000000",
"realtime.segment.flush.threshold.time": "6h"
}
},
"upsertConfig": {
"mode": "FULL"
},
"routing": {
"instanceSelectorType": "strictReplicaGroup"
},
"metadata": {
"customConfigs": {}
},
"task": {
"taskTypeConfigsMap": {
"RealtimeToOfflineSegmentsTask": {
"bucketTimePeriod": "1d",
"bufferTimePeriod": "1h",
"roundBucketTimePeriod": "1h",
"mergeType": "rollup",
"score.aggregationType": "max",
"maxNumRecordsPerSegment": "100000"
}
}
}
}
Offline
{
"tableName": "transcript",
"tableType": "OFFLINE",
"segmentsConfig": {
"replication": 1,
"timeColumnName": "timestamp",
"timeType": "MILLISECONDS",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": 365
},
"tenants": {
"broker": "DefaultTenant",
"server": "DefaultTenant"
},
"tableIndexConfig": {
"loadMode": "MMAP"
},
"metadata": {}
}
Abhijeet Kushe
08/06/2021, 6:03 PMAbhijeet Kushe
08/06/2021, 6:03 PMtasks/scheduler/jobDetails?tableName=transcript_REALTIME&taskType=RealtimeToOfflineSegmentsTask
Abhijeet Kushe
08/06/2021, 6:03 PM{
"code": 404,
"error": "Unable to find job detail for table name - transcript_REALTIME, task type - RealtimeToOfflineSegmentsTask"
}
Neha Pawar
Abhijeet Kushe
08/06/2021, 6:34 PMAbhijeet Kushe
08/06/2021, 6:34 PMAbhijeet Kushe
08/06/2021, 6:34 PMAbhijeet Kushe
08/09/2021, 4:05 PMtasks/RealtimeToOfflineSegmentsTask/state
Abhijeet Kushe
08/09/2021, 4:07 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
08/09/2021, 5:04 PMAbhijeet Kushe
08/09/2021, 5:04 PMNeha Pawar
Abhijeet Kushe
08/09/2021, 6:05 PM2021/08/08 13:36:04.468 INFO [ParticipantHealthReportTask] [Start a Pinot [MINION]] Start HealthCheckInfoReportingTask
2021/08/08 13:36:04.468 INFO [CallbackHandler] [Start a Pinot [MINION]] initializing CallbackHandler: org.apache.helix.manager.zk.CallbackHandler@713b237a content: CallbackHandler{_watchChild=false, _preFetchEnabled=false, _batchModeEnabled=false, _path='/pinot-quickstart/INSTANCES/Minion_pinot-minion-0.pinot-minion-headless.default.svc.cluster.local_9514/MESSAGES', _listener=org.apache.helix.messaging.handling.HelixTaskExecutor@1ae627f3, _changeType=MESSAGE, _manager=org.apache.helix.manager.zk.ZKHelixManager@101067e8, _zkClient=org.apache.helix.manager.zk.ZkClient@
Abhijeet Kushe
08/09/2021, 6:06 PM[Start a Pinot [MINION]] Initialized TaskExecutorFactoryRegistry with 5 task executor factories: [MergeRollupTask, RealtimeToOfflineSegmentsTask, ConvertToRawIndexTask, PurgeTask, SegmentGenerationAndPushTask]
Abhijeet Kushe
08/09/2021, 6:06 PM2021/08/08 13:35:28.072 INFO [TaskFactoryRegistry] [Start a Pinot [MINION]] Registering RealtimeToOfflineSegmentsTask with task executor factory: RealtimeToOfflineSegmentsTaskExecutorFactory, event observer factory: DefaultMinionEventObserverFactory
2021/08/08 13:35:28.072 INFO [TaskFactoryRegistry] [Start a Pinot [MINION]] Registering ConvertToRawIndexTask with task executor factory: ConvertToRawIndexTaskExecutorFactory, event observer factory: DefaultMinionEventObserverFactory
2021/08/08 13:35:28.072 INFO [TaskFactoryRegistry] [Start a Pinot [MINION]] Registering PurgeTask with task executor factory: PurgeTaskExecutorFactory, event observer factory: DefaultMinionEventObserverFactory
2021/08/08 13:35:28.072 INFO [TaskFactoryRegistry] [Start a Pinot [MINION]] Registering SegmentGenerationAndPushTask with task executor factory: SegmentGenerationAndPushTaskExecutorFactory, event observer factory: DefaultMinionEventObserverFactory
Abhijeet Kushe
08/09/2021, 6:13 PM2021/08/09 14:40:42.266 INFO [PeriodicTaskScheduler] [pool-7-thread-9] Starting PinotTaskManager with running frequency of 3600 seconds.
2021/08/09 14:40:42.267 INFO [BasePeriodicTask] [pool-7-thread-9] Start running task: PinotTaskManager
2021/08/09 14:40:42.276 INFO [PinotTaskManager] [pool-7-thread-9] Trying to schedule task type: RealtimeToOfflineSegmentsTask, isLeader: true
2021/08/09 14:40:42.276 INFO [RealtimeToOfflineSegmentsTaskGenerator] [pool-7-thread-9] Start generating task configs for table: transcript_REALTIME for task: RealtimeToOfflineSegmentsTask
2021/08/09 14:40:42.283 INFO [RealtimeToOfflineSegmentsTaskGenerator] [pool-7-thread-9] Window data overflows into CONSUMING segments for partition of segment: transcript__0__1__20210805T2101Z. Skipping task generation: RealtimeToOfflineSegmentsTask
2021/08/09 14:40:42.283 INFO [RealtimeToOfflineSegmentsTaskGenerator] [pool-7-thread-9] Found no eligible segments for task: RealtimeToOfflineSegmentsTask with window [1628380800000 - 1628467200000). Skipping task generation
2021/08/09 14:40:42.283 INFO [PinotTaskManager] [pool-7-thread-9] No task to schedule for task type: RealtimeToOfflineSegmentsTask
2021/08/09 14:40:42.283 INFO [BasePeriodicTask] [pool-7-thread-9] Finish running task: PinotTaskManager in 16ms
2021/08/09 14:41:00.445 INFO [PeriodicTaskScheduler] [pool-7-thread-1] Starting SegmentStatusChecker with running frequency of 300 seconds.
2021/08/09 14:41:00.445 INFO [BasePeriodicTask] [pool-7-thread-1] Start running task: SegmentStatusChecker
2021/08/09 14:41:00.445 INFO [ControllerPeriodicTask] [pool-7-thread-1] Processing 2 tables in task: SegmentStatusChecker
2021/08/09 14:41:00.454 INFO [ControllerPeriodicTask] [pool-7-thread-1] Finish processing 2/2 tables in task: SegmentStatusChecker
2021/08/09 14:41:00.454 INFO [BasePeriodicTask] [pool-7-thread-1] Finish running task: SegmentStatusChecker in 9ms
Abhijeet Kushe
08/09/2021, 6:14 PMAbhijeet Kushe
08/09/2021, 6:16 PMAbhijeet Kushe
08/09/2021, 6:17 PMWindow data overflows into CONSUMING segments
Abhijeet Kushe
08/09/2021, 6:18 PM"segmentsConfig": {
"timeType": "MILLISECONDS",
"schemaName": "transcript",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": "2",
"timeColumnName": "timestamp",
"segmentPushType": "APPEND",
"replicasPerPartition": "1"
},
Neha Pawar
Abhijeet Kushe
08/09/2021, 6:45 PMNeha Pawar
Abhijeet Kushe
08/09/2021, 6:48 PM{
"id": "transcript_REALTIME",
"simpleFields": {
"BATCH_MESSAGE_MODE": "false",
"BUCKET_SIZE": "0",
"IDEAL_STATE_MODE": "CUSTOMIZED",
"INSTANCE_GROUP_TAG": "transcript_REALTIME",
"MAX_PARTITIONS_PER_INSTANCE": "1",
"NUM_PARTITIONS": "2",
"REBALANCE_MODE": "CUSTOMIZED",
"REPLICAS": "1",
"STATE_MODEL_DEF_REF": "SegmentOnlineOfflineStateModel",
"STATE_MODEL_FACTORY_NAME": "DEFAULT"
},
"mapFields": {
"transcript__0__1__20210805T2101Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"transcript__0__2__20210808T1446Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "CONSUMING"
}
},
"listFields": {}
}
Neha Pawar
select $segmentName, min("timestamp"), max("timestamp") from transcript_REALTIME group by $segmentName
Can you run this? This will tell you the time ranges in your segments. You have set bucketTime=1d. So the job will pick the least time and do least time + 1 day, as the bucket. If this bucket overlaps with the CONSUMING segment, that means it does not have complete data for the bucket, and it will wait.
Neha Pawar
Abhijeet Kushe
08/09/2021, 6:55 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
08/09/2021, 7:08 PMAbhijeet Kushe
08/09/2021, 7:08 PMAbhijeet Kushe
08/09/2021, 7:09 PMNeha Pawar
1628431856000
, and because you’ve set bucketTime=1d, the window will be [1628431856000, 1628431856000+1d]. It will find all completed segments which have data in this window. In your case this is transcript__0__1. But even the CONSUMING segment has data for this window. So we don't have all segments for this window at this time.
https://docs.pinot.apache.org/operators/operating-pinot/pinot-managed-offline-flows#how-this-works
Neha Pawar
Abhijeet Kushe
08/09/2021, 7:29 PMAbhijeet Kushe
08/09/2021, 7:29 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
08/09/2021, 8:38 PMNeha Pawar
Abhijeet Kushe
08/09/2021, 8:39 PMAbhijeet Kushe
08/09/2021, 8:39 PMNeha Pawar
Abhijeet Kushe
08/09/2021, 8:46 PMAbhijeet Kushe
08/09/2021, 8:46 PMNeha Pawar
Abhijeet Kushe
08/09/2021, 11:51 PMNeha Pawar
Abhijeet Kushe
08/10/2021, 12:48 AMAbhijeet Kushe
08/10/2021, 1:34 PMNeha Pawar
Abhijeet Kushe
08/10/2021, 8:33 PMINFO [LLRealtimeSegmentDataManager_metrics__1__0__20210810T1937Z] [metrics__1__0__20210810T1937Z] Consumed 0 events from (rate:0.0/s), currentOffset={"shardId-000000000001":"49620943664941832421991682937699078913900748813098811410"}, numRowsConsumedSoFar=142679, numRowsIndexedSoFar=0
2021/08/10 20:25:18.748 WARN [KinesisConsumer] [pool-2263-thread-1]
Abhijeet Kushe
08/10/2021, 9:04 PMNeha Pawar
Abhijeet Kushe
08/11/2021, 1:46 PMAbhijeet Kushe
08/11/2021, 2:00 PMAbhijeet Kushe
08/11/2021, 2:10 PMselect count(*) from transcript_OFFLINE
I get 12
(Because the offline segment has not been generated I will have to send a terminal event for the segment)
But the confusing part is when I query select count(*) from transcript
I get 1
as the output and not 13
My assumption is that simply running the query against transcript will combine the output of both REALTIME and OFFLINE, but apparently I am only seeing transcript_REALTIME
output.
Neha Pawar
Abhijeet Kushe
08/11/2021, 3:34 PMIn case of hybrid tables, the brokers ensure that the overlap between realtime and offline segment data is queried exactly once, by performing offline and realtime federation
"""""
The broker then merges results from both these queries before returning back to the client.
Neha Pawar
Abhijeet Kushe
08/11/2021, 5:38 PMNeha Pawar
Abhijeet Kushe
08/11/2021, 6:41 PMAbhijeet Kushe
08/11/2021, 6:41 PMAbhijeet Kushe
08/11/2021, 6:41 PMAbhijeet Kushe
08/11/2021, 6:42 PMNeha Pawar
Neha Pawar
Neha Pawar
day 1, day 2, day 3, day 4, day 5….day 20
. And the realtime table will have only few days, say, day 18, day 19, day 20, day 21
.
In such a case, you don't want to be counting day 18, day 19 and day 20
twice.
So we will look at the latest timestamp in offline (in this case day 20), and a query like select * from table
will become, select * from table_OFFLINE where timestamp < day 20
and select * from table_REALTIME where timestamp >= day 20
Abhijeet Kushe
08/11/2021, 6:57 PMNeha Pawar
select timestamp from transcript_OFFLINE
and select timestamp from transcript_REALTIME
, and then observe broker logs when you run select * from transcript
Abhijeet Kushe
08/11/2021, 9:19 PMselect count(*) FROM transcript
but I see 1
select count(*) FROM transcript
count(*)
1
select $segmentName, min("timestamp"), max("timestamp") from transcript_REALTIME group by $segmentName
$segmentName min(timestamp) max(timestamp)
transcript__0__2__20210808T1446Z 1628552862000 1628552862000
select count(*) FROM transcript_REALTIME WHERE "timestamp" >= 1628552862000
count(*)
1
select count(*) FROM transcript_OFFLINE WHERE "timestamp" < 1628552862000
count(*)
12
Neha Pawar
select $segmentName, min("timestamp"), max("timestamp") from transcript_OFFLINE group by $segmentName
?Abhijeet Kushe
08/12/2021, 12:04 AMtranscript_1628431200000_1628431200000_0 1628431200000 1628431200000
Neha Pawar
Neha Pawar
Neha Pawar
1628431200000
. So the 2 queries are going to be
select * from table_OFFLINE where timestamp <= (1628431200000 - 1d)
and
select * from table_REALTIME where timestamp > (1628431200000 - 1d)
. Refer to example above and also that doc picture, which both show that the latest day is always favored from realtime, because you might still be receiving events for that day.
The 1d is determined by the segmentPushFrequency setting in the table config (DAILY/HOURLY).
Abhijeet Kushe
08/12/2021, 1:12 AMNeha Pawar
Abhijeet Kushe
08/17/2021, 9:41 PM2021/08/17 19:41:05.170 INFO [RealtimeToOfflineSegmentsTaskGenerator] [pool-7-thread-3] Partitions: [0] have no completed segments. Table: metrics_REALTIME is not ready for RealtimeToOfflineSegmentsTask. Skipping task generation.
2021/08/17 20:41:05.190 INFO [RealtimeToOfflineSegmentsTaskGenerator] [pool-7-thread-4] Start generating task configs for table: metrics_REALTIME for task: RealtimeToOfflineSegmentsTask
2021/08/17 20:41:05.204 INFO [RealtimeToOfflineSegmentsTaskGenerator] [pool-7-thread-4] Partitions: [0] have no completed segments. Table: metrics_REALTIME is not ready for RealtimeToOfflineSegmentsTask. Skipping task generation.
External view in the Zookeeper browser has this info
"metrics__0__0__20210813T1710Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "CONSUMING"
},
"metrics__1__10__20210816T0511Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__11__20210816T1112Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__12__20210816T1712Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__13__20210816T2313Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__14__20210817T0513Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__15__20210817T1113Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__16__20210817T1713Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "CONSUMING"
},
"metrics__1__8__20210815T1711Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
},
"metrics__1__9__20210815T2311Z": {
"Server_pinot-server-0.pinot-server-headless.default.svc.cluster.local_8098": "ONLINE"
}
Any ideas?
Neha Pawar
Partitions: [0] have no completed segments. Table: metrics_REALTIME is not ready for RealtimeToOfflineSegmentsTask. Skipping task generation.
Neha Pawar
Abhijeet Kushe
08/18/2021, 5:33 PMAbhijeet Kushe
08/18/2021, 5:34 PMNeha Pawar
Abhijeet Kushe
08/18/2021, 5:35 PMAbhijeet Kushe
08/18/2021, 5:36 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
08/18/2021, 5:42 PMAbhijeet Kushe
08/19/2021, 5:22 PMNeha Pawar
Abhijeet Kushe
08/19/2021, 7:13 PMAbhijeet Kushe
08/19/2021, 7:14 PMNeha Pawar
Abhijeet Kushe
08/19/2021, 7:17 PMAbhijeet Kushe
08/31/2021, 1:52 AMaccount_id type task_name task_state timestamp workflow_id
7119049663464 task test_task_0 running 1630361707931 f4b91f52-16f2-4a2c-8865-bf5039ac4643
7119049663464 task test_task_2 completed 1630361712931 f4b91f52-16f2-4a2c-8865-bf5039ac4643
7119049663464 task test_task_1 skipped 1630361717931 f4b91f52-16f2-4a2c-8865-bf5039ac4643
7119049663464 task test_task_0 timeout 1630361722931 f4b91f52-16f2-4a2c-8865-bf5039ac4643
7119049663464 task test_task_2 failed 1630361727931 f4b91f52-16f2-4a2c-8865-bf5039ac4643
expected count for each task
test_task_0 1
test_task_1 1
test_task_2 1
but we get
test_task_1 1
test_task_2 1
test_task_0 1
test_task_1 1
test_task_2 1
Abhijeet Kushe
08/31/2021, 1:54 AMMayank
Abhijeet Kushe
08/31/2021, 12:21 PMMayank
Abhijeet Kushe
09/09/2021, 2:45 PMNeha Pawar
Abhijeet Kushe
09/10/2021, 4:28 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
09/10/2021, 5:01 PMAbhijeet Kushe
09/10/2021, 5:03 PMNeha Pawar
Abhijeet Kushe
09/10/2021, 5:06 PMNeha Pawar
Jackie
09/10/2021, 6:48 PMJackie
09/10/2021, 6:49 PMAbhijeet Kushe
09/10/2021, 9:05 PMControl the *number of records you want in a segment generated*. Useful if the time window has many records, but you don't want them all in the same segment.
This means that it is time window or max number of rows, whichever happens first. I was referring to compacting across time windows, which you answered: I will have to delete the segments first before replacing.
Neha Pawar
Abhijeet Kushe
09/13/2021, 10:30 PMJackie
09/13/2021, 10:40 PMmaxNumRecordsPerSegment
might be too small. I'd recommend at least 1M (it is 5M by default), or the task might generate too many segments per day.
Abhijeet Kushe
09/13/2021, 10:42 PMAbhijeet Kushe
09/13/2021, 10:42 PMJackie
09/13/2021, 10:44 PMAbhijeet Kushe
09/13/2021, 10:44 PMAbhijeet Kushe
10/07/2021, 8:25 PM{
"tableName": "workflowEvents",
"tableType": "REALTIME",
"segmentsConfig": {
"timeColumnName": "eventTimestamp",
"timeType": "MILLISECONDS",
"schemaName": "workflowEvents",
"replicasPerPartition": "1",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": "10",
"segmentPushType": "APPEND"
},
"tenants": {
"broker": "DefaultTenant",
"server": "DefaultTenant"
},
"tableIndexConfig": {
"loadMode": "MMAP",
"streamConfigs": {
"streamType": "kinesis",
"stream.kinesis.topic.name": "qa-rel-metrics-stream",
"region": "us-east-1",
"shardIteratorType": "LATEST",
"stream.kinesis.consumer.type": "lowlevel",
"stream.kinesis.fetch.timeout.millis": "30000",
"stream.kinesis.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
"stream.kinesis.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kinesis.KinesisConsumerFactory",
"realtime.segment.flush.threshold.size": "1000000",
"realtime.segment.flush.threshold.time": "6h"
}
},
"upsertConfig": {
"mode": "FULL"
},
"routing": {
"instanceSelectorType": "strictReplicaGroup"
},
"metadata": {
"customConfigs": {}
},
"task": {
"taskTypeConfigsMap": {
"RealtimeToOfflineSegmentsTask": {
"bucketTimePeriod": "7d",
"bufferTimePeriod": "1d",
"roundBucketTimePeriod": "1h",
"mergeType": "rollup",
"score.aggregationType": "max",
"maxNumRecordsPerSegment": "5000000"
}
}
}
}
Abhijeet Kushe
10/07/2021, 8:25 PM{
"tableName": "workflowEvents",
"tableType": "OFFLINE",
"segmentsConfig": {
"replication": 1,
"timeColumnName": "eventTimestamp",
"timeType": "MILLISECONDS",
"retentionTimeUnit": "DAYS",
"retentionTimeValue": 365
},
"tenants": {
"broker": "DefaultTenant",
"server": "DefaultTenant"
},
"tableIndexConfig": {
"loadMode": "MMAP"
},
"ingestionConfig": {
"batchIngestionConfig": {
"segmentIngestionType": "APPEND",
"segmentIngestionFrequency": "DAILY"
}
},
"metadata": {}
}
Abhijeet Kushe
10/07/2021, 8:27 PMAbhijeet Kushe
10/07/2021, 8:28 PMAbhijeet Kushe
10/07/2021, 8:30 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
10/07/2021, 9:42 PMAbhijeet Kushe
10/07/2021, 9:43 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:49 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:50 PM{
"id": "workflowEvents_REALTIME",
"simpleFields": {
"BATCH_MESSAGE_MODE": "false",
"IDEAL_STATE_MODE": "CUSTOMIZED",
"INSTANCE_GROUP_TAG": "workflowEvents_REALTIME",
"MAX_PARTITIONS_PER_INSTANCE": "1",
"NUM_PARTITIONS": "39",
"REBALANCE_MODE": "CUSTOMIZED",
"REPLICAS": "1",
"STATE_MODEL_DEF_REF": "SegmentOnlineOfflineStateModel",
"STATE_MODEL_FACTORY_NAME": "DEFAULT"
},
"mapFields": {
"workflowEvents__0__0__20210927T1932Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__10__20211004T1923Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__11__20211005T0123Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__12__20211005T1023Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__13__20211005T1624Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__14__20211005T2224Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__15__20211006T0424Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__16__20211006T1524Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__17__20211006T2125Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__18__20211007T0325Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__19__20211007T1725Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "CONSUMING"
},
"workflowEvents__0__1__20210930T1921Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__2__20211001T0121Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__3__20211001T0722Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__4__20211001T1322Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__5__20211001T1922Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__6__20211002T0122Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__7__20211002T1122Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__8__20211002T1722Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__0__9__20211004T1323Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__0__20210927T1932Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__10__20211004T1923Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__11__20211005T0123Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__12__20211005T1424Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__13__20211005T2024Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__14__20211006T0225Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__15__20211006T1425Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__16__20211006T2025Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__17__20211007T0225Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__18__20211007T1626Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "CONSUMING"
},
"workflowEvents__1__1__20210930T1921Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__2__20211001T0121Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__3__20211001T0722Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__4__20211001T1322Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__5__20211001T1922Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__6__20211002T0122Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__7__20211002T1122Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__8__20211002T1722Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
},
"workflowEvents__1__9__20211004T1323Z": {
"Server_cdp-dl-pinot-k8s-server-0.cdp-dl-pinot-k8s-server-headless.dp-metrics-pinot.svc.cluster.local_8098": "ONLINE"
}
},
"listFields": {}
}
Neha Pawar
Abhijeet Kushe
10/07/2021, 9:52 PMAbhijeet Kushe
10/07/2021, 9:52 PMAbhijeet Kushe
10/07/2021, 9:53 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:54 PMAbhijeet Kushe
10/07/2021, 9:54 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:54 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:56 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:57 PMAbhijeet Kushe
10/07/2021, 9:57 PMAbhijeet Kushe
10/07/2021, 9:57 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 9:59 PMAbhijeet Kushe
10/07/2021, 9:59 PMAbhijeet Kushe
10/07/2021, 10:01 PMAbhijeet Kushe
10/07/2021, 10:02 PMNeha Pawar
Neha Pawar
Abhijeet Kushe
10/07/2021, 10:04 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 10:04 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 10:06 PMAbhijeet Kushe
10/07/2021, 10:08 PMNeha Pawar
Neha Pawar
AT_SEQUENCE_NUMBER
and then restart again?
Abhijeet Kushe
10/07/2021, 10:11 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 10:14 PMNeha Pawar
Neha Pawar
Neha Pawar
Abhijeet Kushe
10/07/2021, 10:21 PMAbhijeet Kushe
10/07/2021, 10:22 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 10:30 PMAbhijeet Kushe
10/07/2021, 10:31 PMNeha Pawar
Abhijeet Kushe
10/07/2021, 10:33 PMAbhijeet Kushe
10/08/2021, 5:43 PMNo job to purge for the queue TaskQueue_RealtimeToOfflineSegmentsTask
Abhijeet Kushe
10/08/2021, 5:44 PMAbhijeet Kushe
10/08/2021, 7:30 PMAbhijeet Kushe
10/11/2021, 1:07 PMMayank
Neha Pawar
Neha Pawar
Mayank
Abhijeet Kushe
10/11/2021, 6:08 PMNeha Pawar
Abhijeet Kushe
10/11/2021, 6:10 PMNeha Pawar
Abhijeet Kushe
10/22/2021, 2:45 PMNeha Pawar
Abhijeet Kushe
10/23/2021, 12:59 AMNeha Pawar
Abhijeet Kushe
10/25/2021, 5:10 PMNeha Pawar
Abhijeet Kushe
10/25/2021, 5:12 PMAbhijeet Kushe
10/25/2021, 5:13 PM