Commit 78e70ed
1 parent 402f187
Showing 8 changed files with 446 additions and 14 deletions.
@@ -0,0 +1,36 @@
# The minimum required config options for loading from kafka
{
  "input": {
    "type": "kafka"
    "brokers": "localhost:9092"
    "topicName": "enriched-events"
    "retries": 10
    "groupId": "kafka-consumer"
    "buffer": {
      "recordLimit": 500
    }
  }
  "output": {
    "good": {
      "client": {
        "endpoint": "localhost"
      }
      "cluster": {
        "index": "good"
      }
    }
    "bad" {
      "type": "kafka"
      "brokers": "localhost:9092"
      "topicName": "bad-events"

      "retries": 10
      "groupId": "kafka-consumer"
      "buffer": {
        "byteLimit": 4500000
        "timeLimit": 500
      }
    }
  }
  "purpose": "JSON"
}
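A file like the one above is plain HOCON, so it can be inspected independently of the loader. The snippet below is a minimal sketch, assuming the Lightbend Config library is on the classpath; the file name used here is hypothetical and not part of this commit.

// Minimal sketch: parsing the HOCON above with the Lightbend Config library.
// The file name is hypothetical; key paths mirror the nesting in the file.
import com.typesafe.config.ConfigFactory
import java.io.File

object MinimalConfigCheck {
  def main(args: Array[String]): Unit = {
    val config = ConfigFactory.parseFile(new File("config.kafka.minimal.hocon")).resolve()

    val inputType   = config.getString("input.type")            // "kafka"
    val brokers     = config.getString("input.brokers")         // "localhost:9092"
    val recordLimit = config.getInt("input.buffer.recordLimit") // 500

    println(s"input=$inputType brokers=$brokers recordLimit=$recordLimit")
  }
}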
@@ -0,0 +1,129 @@
{
  "input": {
    # Sources currently supported are:
    # "kinesis" for reading records from a Kinesis stream
    # "kafka" for reading records from a Kafka topic
    # "stdin" for reading unencoded tab-separated events from stdin
    #   If set to "stdin", JSON documents will not be sent to Elasticsearch
    #   but will be written to stdout.
    # "nsq" for reading unencoded tab-separated events from NSQ
    "type": "kafka"

    # Topic name for incoming data
    "topicName": "enriched-events"

    # Kafka brokers
    "brokers": "localhost:9092"

    # Events are accumulated in a buffer before being sent to Elasticsearch.
    # The buffer is emptied whenever the number of stored records exceeds recordLimit.
    "buffer": {
      "recordLimit": 499 # Default value 500
    }
  }

  "output": {
    "good": {
      # Good sinks currently supported are:
      # "elasticsearch" for writing good records to Elasticsearch
      # "stdout" for writing good records to stdout
      # Default value "elasticsearch"
      "type": "elasticsearch"

      # Events are indexed using an Elasticsearch Client
      # - endpoint: the cluster endpoint
      # - port (optional, default value 9200): the port the cluster can be accessed on
      #   - for http this is usually 9200
      #   - for transport this is usually 9300
      # - username (optional, remove if not active): http basic auth username
      # - password (optional, remove if not active): http basic auth password
      # - shardDateFormat (optional, remove if not needed): formatting used for sharding good stream, i.e. _yyyy-MM-dd
      # - shardDateField (optional, if not specified derived_tstamp is used): timestamp field for sharding good stream
      # - maxTimeout: the maximum attempt time before a client restart
      # - maxRetries (optional, default value 6): the maximum number of request attempts before giving up
      # - ssl (optional, default value false): if using the http client, whether to use ssl or not
      "client": {
        "endpoint": "localhost"
        "port": 9200
        "username": "es-user"
        "password": "es-pass"
        "shardDateFormat": "_yyyy-MM-dd"
        "shardDateField": "derived_tstamp"
        "maxTimeout": 9999
        "maxRetries": 5
        "ssl": true
      }

      # When using the AWS ES service
      # - signing: if using the http client and the AWS ES service you can sign your requests
      #   http://docs.aws.amazon.com/general/latest/gr/signing_aws_api_requests.html
      # - region: where the AWS ES service is located
      # These values are optional.
      "aws": {
        "signing": true # Default value false
        "region": "eu-central-1" # Default value empty string
      }

      "cluster": {
        # The Elasticsearch index name
        # Default value "good"
        "index": "good"
        # The Elasticsearch index type.
        # Index types are deprecated in ES >= 7.x,
        # so documentType should not be set with ES >= 7.x.
        "documentType": "good-doc"
      }

      # Bulk requests to Elasticsearch will be split into
      # chunks according to the given limits.
      # These values are optional.
      "chunk": {
        "byteLimit": 999999 # Default value is 1000000
        "recordLimit": 499 # Default value is 500
      }
    }
    "bad" {
      # Bad sinks currently supported are:
      # "kinesis" for writing bad records to Kinesis
      # "stderr" for writing bad records to stderr
      # "nsq" for writing bad records to NSQ
      # "kafka" for writing bad records to Kafka
      # "none" for ignoring bad records
      "type": "kafka"

      # Topic name for events which are rejected by Elasticsearch
      "topicName": "bad-events"

      # Kafka brokers
      "brokers": "localhost:9092"

      # Events are accumulated in a buffer before being sent to Kafka.
      # The buffer is emptied whenever:
      # - the combined size of the stored records reaches byteLimit, or
      # - the time in milliseconds since it was last emptied exceeds timeLimit when
      #   a new event enters the buffer
      buffer {
        byteLimit = 50000
        timeLimit = 5000
      }
    }
  }

  # "ENRICHED_EVENTS" for a stream of successfully enriched events
  # "BAD_ROWS" for a stream of bad events
  # "JSON" for writing plain json
  "purpose": "ENRICHED_EVENTS"

  # Optional section for tracking endpoints
  "monitoring": {
    "snowplow": {
      "collector": "localhost:14322"
      "appId": "test-app-id"
    }

    "metrics": {
      # Optional, CloudWatch metrics are enabled by default.
      "cloudWatch": false
    }
  }
}
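The buffer comments in this reference file describe three flush triggers: a record-count limit, a byte-size limit, and a time limit since the last flush. The following is an illustrative Scala sketch of that policy, not the loader's actual implementation; the class and method names are made up for the example.

// Illustrative sketch of the buffer flush policy described in the comments above.
// Names and structure are hypothetical, not the loader's code.
final class FlushPolicy(recordLimit: Long, byteLimit: Long, timeLimitMs: Long) {
  private var records     = 0L
  private var bytes       = 0L
  private var lastFlushMs = System.currentTimeMillis()

  /** Record one incoming event and report whether the buffer should be emptied now. */
  def add(eventSizeBytes: Long): Boolean = {
    records += 1
    bytes += eventSizeBytes
    val now = System.currentTimeMillis()
    val shouldFlush =
      records >= recordLimit ||           // recordLimit reached
      bytes >= byteLimit ||               // combined size reached byteLimit
      (now - lastFlushMs) >= timeLimitMs  // timeLimit exceeded since last flush
    if (shouldFlush) { records = 0; bytes = 0; lastFlushMs = now }
    shouldFlush
  }
}

// e.g. new FlushPolicy(recordLimit = 500, byteLimit = 50000, timeLimitMs = 5000)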
core/src/main/scala/com.snowplowanalytics.stream/loader/executors/KafkaSourceExecutor.scala
137 changes: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
/**
 * Copyright (c) 2014-2021 Snowplow Analytics Ltd.
 * All rights reserved.
 *
 * This program is licensed to you under the Apache License Version 2.0,
 * and you may not use this file except in compliance with the Apache
 * License Version 2.0.
 * You may obtain a copy of the Apache License Version 2.0 at
 * http://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the Apache License Version 2.0 is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied.
 *
 * See the Apache License Version 2.0 for the specific language
 * governing permissions and limitations there under.
 */
package com.snowplowanalytics.stream.loader
package executors

// Kafka
import org.apache.kafka.clients.consumer.KafkaConsumer

// Java
import java.nio.charset.StandardCharsets.UTF_8
import java.time.Duration
import java.util
import java.util.Properties

// Scala
import scala.collection.mutable.ListBuffer
import scala.collection.JavaConverters._

// Logging
import org.slf4j.LoggerFactory

// This project
import com.snowplowanalytics.stream.loader.Config._
import com.snowplowanalytics.stream.loader.clients._
import com.snowplowanalytics.stream.loader.sinks._
import com.snowplowanalytics.stream.loader.transformers.{
  BadEventTransformer,
  EnrichedEventJsonTransformer,
  JsonTransformer
}

/**
 * Kafka source executor
 *
 * @param purpose kind of data stored: good, bad or plain JSON
 * @param kafka the configured Kafka source
 * @param goodSink the configured GoodSink
 * @param badSink the configured BadSink
 * @param shardDateField optional timestamp field used for sharding the good stream
 * @param shardDateFormat optional date format used for sharding the good stream
 */
class KafkaSourceExecutor(
  purpose: Purpose,
  kafka: Source.Kafka,
  goodSink: Either[ISink, BulkSender[EmitterJsonInput]],
  badSink: ISink,
  shardDateField: Option[String],
  shardDateFormat: Option[String]
) extends Runnable {

  private lazy val log = LoggerFactory.getLogger(getClass)

  // Kafka messages are buffered in msgBuffer until its size reaches the configured record limit
  private val msgBuffer = new ListBuffer[EmitterJsonInput]()
  // Emitter instance responsible for sending buffered records to the sinks
  private val emitter =
    new Emitter(
      goodSink,
      badSink
    )
  private val transformer = purpose match {
    case Purpose.Enriched => new EnrichedEventJsonTransformer(shardDateField, shardDateFormat)
    case Purpose.Json     => new JsonTransformer
    case Purpose.Bad      => new BadEventTransformer
  }

  /**
   * Creates a new Kafka consumer with the given
   * configuration options
   *
   * @return a new Kafka consumer
   */
  private def createConsumer: KafkaConsumer[String, Array[Byte]] = {

    log.info(s"Connect Kafka Consumer to brokers: ${kafka.brokers}")

    val props = new Properties()
    props.put("bootstrap.servers", kafka.brokers)
    props.put("group.id", kafka.groupId)
    props.put("enable.auto.commit", "true")
    props.put("auto.commit.interval.ms", "1000")
    props.put("auto.offset.reset", "earliest")
    props.put("session.timeout.ms", "30000")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer")

    // Any user-supplied consumer properties override the defaults above
    kafka.sourceConf.getOrElse(Map()).foreach { case (k, v) => props.setProperty(k, v) }

    new KafkaConsumer[String, Array[Byte]](props)
  }

  /** Never-ending processing loop over the source stream. */
  override def run(): Unit = {
    log.info(s"Running Kafka consumer group: ${kafka.groupId}.")
    log.info(s"Processing raw input Kafka topic: ${kafka.topicName}")
    val kafkaBufferSize = kafka.buffer.recordLimit
    val consumer        = createConsumer

    consumer.subscribe(util.Collections.singletonList(kafka.topicName))

    while (true) {
      val recordValues = consumer
        .poll(Duration.ofMillis(100)) // Wait 100 ms if data is not available
        .asScala
        .toList
        .map(_.value)
      msgBuffer.synchronized {
        for (record <- recordValues) {
          val msgStr       = new String(record, UTF_8)
          val emitterInput = transformer.consumeLine(msgStr)
          msgBuffer += emitterInput

          if (msgBuffer.size == kafkaBufferSize) {
            val rejectedRecords = emitter.attemptEmit(msgBuffer.toList)
            emitter.fail(rejectedRecords.asJava)
            msgBuffer.clear()
          }
        }
      }
    }
  }
}
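Two details of KafkaSourceExecutor above may be worth a note. First, createConsumer applies any sourceConf entries on top of the hard-coded defaults, so arbitrary Kafka consumer properties can be overridden from configuration. Second, the class is a Runnable, so it is expected to run on its own thread. The sketch below illustrates both points; the override map, property values, and thread wiring are illustrative assumptions, not something defined in this commit.

// Sketch: how sourceConf-style overrides layer on top of default consumer properties,
// and how a Runnable executor is typically started. Values here are illustrative only.
import java.util.Properties

object KafkaExecutorSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("enable.auto.commit", "true")

    // Later entries win, mirroring the sourceConf loop in createConsumer above.
    val sourceConf = Map("enable.auto.commit" -> "false", "max.poll.records" -> "250")
    sourceConf.foreach { case (k, v) => props.setProperty(k, v) }

    println(props) // enable.auto.commit is now "false"

    // A Runnable such as KafkaSourceExecutor would then be run on a dedicated thread:
    // new Thread(executor).start()
  }
}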