# troubleshooting
hello.
`UNHEALTHY_TASK` or `UNHEALTHY_SUPERVISOR` could mean a number of different things. Do you see any errors in your supervisor? Can you share the overlord logs or relevant stack traces?
Can it be because of the resources I provide?
Yes, it can be. Have you checked that the supervisor's tuning config is good? Specifically, do `maxRowsInMemory` and `maxBytesInMemory` look sane with respect to the JVM settings? If the supervisor can't accommodate these thresholds, it'll OOM and go into an unhealthy state. The config and logs/stack traces would be helpful here.
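For reference, a minimal `tuningConfig` sketch with both thresholds set explicitly (the numbers are purely illustrative, not a recommendation; size them against the task JVM heap):

```json
{
  "type": "kafka",
  "maxRowsInMemory": 150000,
  "maxBytesInMemory": 268435456
}
```

Leaving `maxBytesInMemory` at 0 makes Druid fall back to a fraction of the maximum JVM heap (one sixth in recent versions), so an explicit value keeps the memory budget easier to reason about.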
I increased the retention period for Kafka.
What are the configured `taskCount` and `taskDuration`? If the supervisor can't keep up with the load offered on the input topic, such that it falls outside the retention period, you might want to tweak the former setting. Understanding your lag patterns can help.
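As a rough illustration only (the topic name and numbers here are placeholders, not tuned for your load), a shorter `taskDuration` and a `taskCount` sized to the topic's partition count usually keeps lag from building up:

```json
{
  "type": "kafka",
  "topic": "your-topic",
  "taskCount": 3,
  "taskDuration": "PT1H",
  "useEarliestOffset": true
}
```

Note that `taskCount` is effectively capped by the number of Kafka partitions, so raising it beyond the partition count doesn't add read parallelism.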
Hi @Abhinav Raghuvanshi, thanks for your reply!! So far I haven't had any OOM issues. Please find the config below:
```json
{
  "type": "kafka",
  "spec": {
    "dataSchema": {
      "dataSource": "eberspacher_gateway_sensor",
      "timestampSpec": {
        "column": "timestamp",
        "format": "auto",
        "missingValue": null
      },
      "dimensionsSpec": {
        "dimensions": [
          {
            "type": "string",
            "name": "gw_id",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "type",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "double",
            "name": "value",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          }
        ],
        "dimensionExclusions": [
          "__time",
          "count",
          "timestamp"
        ],
        "includeAllDimensions": false
      },
      "metricsSpec": [
        {
          "type": "count",
          "name": "count"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "MONTH",
        "queryGranularity": {
          "type": "none"
        },
        "rollup": true,
        "intervals": []
      },
      "transformSpec": {
        "filter": null,
        "transforms": []
      }
    },
    "ioConfig": {
      "topic": "eberspacher-gateway-sensor-data-qc",
      "inputFormat": {
        "type": "json",
        "flattenSpec": {
          "useFieldDiscovery": true,
          "fields": []
        },
        "keepNullColumns": true,
        "assumeNewlineDelimited": false,
        "useJsonNodeReader": false
      },
      "replicas": 1,
      "taskCount": 1,
      "taskDuration": "PT604800S",
      "consumerProperties": {
        "bootstrap.servers": "kafka-kafka-bootstrap.kafka.svc.cluster.local:9092,",
        "security.protocol": "SASL_PLAINTEXT",
        "sasl.mechanism": "SCRAM-SHA-512",
        "sasl.jaas.config": "org.apache.kafka.common.security.scram.ScramLoginModule required username='admin-etm-qc' password='xxxxxx';",
        "auto.offset.reset": "earliest"
      },
      "autoScalerConfig": null,
      "pollTimeout": 100,
      "startDelay": "PT5S",
      "period": "PT30S",
      "useEarliestOffset": true,
      "completionTimeout": "PT1800S",
      "lateMessageRejectionPeriod": null,
      "earlyMessageRejectionPeriod": null,
      "lateMessageRejectionStartDateTime": null,
      "configOverrides": null,
      "idleConfig": null,
      "stream": "eberspacher-gateway-sensor-data-qc",
      "useEarliestSequenceNumber": true,
      "type": "kafka"
    },
    "tuningConfig": {
      "type": "kafka",
      "appendableIndexSpec": {
        "type": "onheap",
        "preserveExistingMetrics": false
      },
      "maxRowsInMemory": 1000000,
      "maxBytesInMemory": 0,
      "skipBytesInMemoryOverheadCheck": false,
      "maxRowsPerSegment": 5000000,
      "maxTotalRows": null,
      "intermediatePersistPeriod": "PT10M",
      "maxPendingPersists": 0,
      "indexSpec": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "stringDictionaryEncoding": {
          "type": "utf8"
        },
        "metricCompression": "lz4",
        "longEncoding": "longs"
      },
      "indexSpecForIntermediatePersists": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "stringDictionaryEncoding": {
          "type": "utf8"
        },
        "metricCompression": "lz4",
        "longEncoding": "longs"
      },
      "reportParseExceptions": false,
      "handoffConditionTimeout": 0,
      "resetOffsetAutomatically": true,
      "segmentWriteOutMediumFactory": null,
      "workerThreads": null,
      "chatThreads": null,
      "chatRetries": 8,
      "httpTimeout": "PT10S",
      "shutdownTimeout": "PT80S",
      "offsetFetchPeriod": "PT30S",
      "intermediateHandoffPeriod": "P2147483647D",
      "logParseExceptions": false,
      "maxParseExceptions": 2147483647,
      "maxSavedParseExceptions": 0,
      "skipSequenceNumberAvailabilityCheck": false,
      "repartitionTransitionDuration": "PT120S"
    }
  },
  "context": null
}
```
@Abhinav Raghuvanshi I've made the changes below:
```
"maxRowsInMemory": 5000000   ->  "maxRowsInMemory": 10000000
"maxBytesInMemory": 0        ->  "maxBytesInMemory": 1073741824   (1 GiB)
"taskCount": 1               ->  "taskCount": 3
"taskDuration": "PT604800S"  ->  "taskDuration": "PT3600S"        (1 hour)
```
After applying the above config, all my supervisors are getting unhealthy. I increased the heap settings to
```yaml
config:
    DRUID_XMX: 2048m
    DRUID_XMS: 2048m
```
for the coordinator and historical nodes, but I'm still getting the error below:
```json
{
  "dataSource": "eberspacher_gateway_sensor",
  "stream": "eberspacher-gateway-sensor-data-qc",
  "partitions": 3,
  "replicas": 1,
  "durationSeconds": 3600,
  "activeTasks": [
    {
      "id": "index_kafka_eberspacher_gateway_sensor_73f8efb6c3a7c4c_ipefgjof",
      "startingOffsets": {
        "1": 638876
      },
      "startTime": "2023-06-26T10:20:43.717Z",
      "remainingSeconds": 3550,
      "type": "ACTIVE",
      "currentOffsets": {},
      "lag": {}
    },
    {
      "id": "index_kafka_eberspacher_gateway_sensor_2d49d78205e884a_hpnfclcb",
      "startingOffsets": {
        "2": 638231
      },
      "startTime": "2023-06-26T10:20:43.333Z",
      "remainingSeconds": 3549,
      "type": "ACTIVE",
      "currentOffsets": {},
      "lag": {}
    }
  ],
  "publishingTasks": [],
  "latestOffsets": {
    "0": 641043,
    "1": 638988,
    "2": 638364
  },
  "minimumLag": {
    "0": 109,
    "1": 112,
    "2": 133
  },
  "aggregateLag": 354,
  "offsetsLastUpdated": "2023-06-26T10:21:13.461Z",
  "suspended": false,
  "healthy": false,
  "state": "UNHEALTHY_TASKS",
  "detailedState": "UNHEALTHY_TASKS",
  "recentErrors": []
}
```
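One thing worth double-checking, as an assumption about your deployment since the snippet above only covers the coordinator and historical: `DRUID_XMX`/`DRUID_XMS` on those services don't change the heap of the ingestion tasks themselves. The Kafka tasks run as peons under the MiddleManager (assuming you're on MiddleManagers rather than Indexers), and the per-task JVM is set in its runtime properties, for example:

```properties
# MiddleManager runtime.properties (illustrative values only)
druid.worker.capacity=4
druid.indexer.runner.javaOptsArray=["-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=2g"]
```

With `maxBytesInMemory` raised to 1 GiB, it's worth making sure each peon heap comfortably exceeds that threshold, otherwise the tasks are likely to keep OOMing and the supervisor will stay unhealthy.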
Do you see any errors in the logs? Can you share the overlord logs?
Nope, we haven't enabled any logs.
Oops, no logs will make it almost impossible to troubleshoot any server-side issue.
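If it helps, task logs can be persisted with a couple of common runtime properties; a minimal sketch assuming local-disk storage (the directory is just a placeholder):

```properties
# common.runtime.properties (illustrative; s3/hdfs/azure log stores are also supported)
druid.indexer.logs.type=file
druid.indexer.logs.directory=/opt/druid/var/indexing-logs
```

That way the next failed task leaves something to look at.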
@Abhinav Raghuvanshi, these are my task logs:
```json
{
  "id": "index_kafka_eber_vehicles_gps_b4db3094603d03e_kfpeaenj",
  "groupId": "index_kafka_eber_vehicles_gps",
  "type": "index_kafka",
  "createdTime": "2023-06-27T06:56:51.968Z",
  "queueInsertionTime": "1970-01-01T00:00:00.000Z",
  "statusCode": "FAILED",
  "status": "FAILED",
  "runnerStatusCode": "WAITING",
  "duration": 73097,
  "location": {
    "host": "10.101.34.67",
    "port": 8101,
    "tlsPort": -1
  },
  "dataSource": "eber_vehicles_gps",
  "errorMsg": "org.apache.druid.java.util.common.ISE: Could not allocate segment for row with timestamp[2023-06-26T..."
In case it helps in any way.
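Since there are no server-side logs, one low-effort addition (not a fix for the allocation error itself) is to flip the parse-exception fields that are already present in the tuningConfig above, so that row-level problems land in the task completion report; the values here are illustrative:

```json
{
  "logParseExceptions": true,
  "maxSavedParseExceptions": 20
}
```

That gives the next failed task more context than a truncated errorMsg.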