flink自定义process,使用状态求历史总和(scala)
Elasticsearch 连接器的 Maven 依赖(在 IDEA 工程的 pom.xml 中添加;下面的代码还依赖 flink-streaming-scala 与 flink-connector-kafka):
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-elasticsearch7_2.11</artifactId>
<version>1.11.1</version>
</dependency>
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch7.{ElasticsearchSink, RestClientFactory}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.flink.util.Collector
import org.apache.http.HttpHost
import org.apache.http.client.config.RequestConfig
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.elasticsearch.action.DocWriteRequest
import org.elasticsearch.client.{Requests, RestClientBuilder}
import java.time.Duration
import java.util.Properties
object Test {
/**
 * Job entry point.
 *
 * Reads tab-separated records from Kafka, assigns event-time timestamps and
 * watermarks, keys the stream by `id`, runs [[DemoProcess]] on every key and
 * indexes each emitted [[ResultBean]] into Elasticsearch.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  val env = StreamExecutionEnvironment.getExecutionEnvironment
  // Enable the settings below when the keyed state has to survive failures.
  //env.setStateBackend(new RocksDBStateBackend(s"hdfs://${namenodeID}", true)) // HDFS as the state backend
  //env.enableCheckpointing(10 * 60 * 1000L)
  //env.getCheckpointConfig.setCheckpointTimeout(10 * 60 * 1000L)
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // event time

  // ---- Kafka consumer configuration ----
  val props = new Properties
  props.setProperty("bootstrap.servers", "host:6667") // some clusters listen on 9092 instead
  props.setProperty("group.id", "groupId")
  // NOTE(review): `retries` looks like a producer-only setting that the consumer ignores — confirm and drop.
  props.setProperty("retries", "10")
  // The Kafka client key is `retry.backoff.ms`; the previous `retries.backoff.ms` is not a known setting.
  props.setProperty("retry.backoff.ms", "100")
  props.setProperty(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "60000")
  // If the cluster requires authentication, add the settings below.
  // props.setProperty("sasl.jaas.config","org.apache.kafka.common.security.plain.PlainLoginModule required username='' password='';")
  //props.setProperty("security.protocol", "SASL_PLAINTEXT");
  // props.setProperty("sasl.mechanism", "PLAIN")

  // ---- Elasticsearch sink ----
  val httpHosts = new java.util.ArrayList[HttpHost]
  httpHosts.add(new HttpHost("esIPOne", 9200, "http"))
  httpHosts.add(new HttpHost("esIPTwo", 9200, "http"))
  httpHosts.add(new HttpHost("esIPThree", 9200, "http"))

  // Turns every result into an index request whose document id is the record key,
  // so a later result for the same key replaces the earlier document.
  val esSinkFunction = new ElasticsearchSinkFunction[ResultBean] {
    override def process(element: ResultBean, ctx: RuntimeContext, indexer: RequestIndexer): Unit = {
      val json = new java.util.HashMap[String, Any]
      json.put("@timestamp", element.ts)
      json.put("data", element.data)
      json.put("sum", element.sum)
      val rqst = Requests.indexRequest()
        .index("indexName")
        .id(element.id)
        .source(json)
        .opType(DocWriteRequest.OpType.INDEX)
      indexer.add(rqst)
    }
  }
  val esSinkBuilder = new ElasticsearchSink.Builder[ResultBean](httpHosts, esSinkFunction)
  setESConf(esSinkBuilder, 5000)

  // ---- Kafka source ----
  val myConsumer = new FlinkKafkaConsumer[DemoBean]("topicName", new DemoKafka(), props)
    .setStartFromEarliest() // start position: earliest available offset

  val source = env
    .addSource(myConsumer)
    .uid("source-data")
    .name("数据源")
    // Tolerate 1 s of out-of-orderness; mark a source idle after 5 s without
    // records so it does not hold back the watermark.
    .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness[DemoBean](Duration.ofSeconds(1))
      .withTimestampAssigner(new SerializableTimestampAssigner[DemoBean] {
        override def extractTimestamp(element: DemoBean, recordTimestamp: Long): Long = element.ts
      }).withIdleness(Duration.ofSeconds(5)))
    .uid("water-marks")
    .name("注册水位线")

  // ---- Pipeline: key by id -> stateful process -> Elasticsearch ----
  source
    .keyBy(k => k.id)
    .process(new DemoProcess())
    .uid("demo-process")
    .name("process 示例")
    .addSink(esSinkBuilder.build())
    .uid("es-sink")
    .name("数据写入es")

  env.execute("任务名")
}
/**
 * Kafka deserialization schema that turns one tab-separated record value into a
 * [[DemoBean]]. Expected layout: `id \t data \t value(Int) \t ts(Long)`; any
 * additional trailing fields are ignored.
 */
private class DemoKafka() extends KafkaDeserializationSchema[DemoBean] {

  /** The topic is read as an unbounded stream: no element ever ends it. */
  override def isEndOfStream(t: DemoBean): Boolean = false

  /**
   * Decodes the record value as UTF-8 and parses its first four fields.
   *
   * @param consumerRecord raw Kafka record; only its value is read
   * @return the parsed bean, or `null` when the value is missing (tombstone),
   *         has fewer than four fields, or carries a non-numeric value/ts.
   *         Flink documents a `null` return as "record could not be
   *         deserialized" and skips it, so one bad record no longer fails the job.
   */
  override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): DemoBean =
    Option(consumerRecord.value())
      // Decode with an explicit charset instead of the JVM's platform default.
      .map(bytes => new String(bytes, java.nio.charset.StandardCharsets.UTF_8).split("\t"))
      .filter(_.length >= 4)
      .flatMap { fields =>
        scala.util.Try(DemoBean(fields(0), fields(1), fields(2).toInt, fields(3).toLong)).toOption
      }
      .orNull // null is the skip signal required by the Flink interface

  /**
   * Type information derived by the Flink Scala macro, which understands case
   * classes. NOTE(review): the Java `TypeExtractor` is expected to treat a
   * Scala case class as a generic (Kryo) type — confirm.
   */
  override def getProducedType: TypeInformation[DemoBean] = createTypeInformation[DemoBean]
}
/**
 * Keyed process function that maintains a running total of `DemoBean.value`
 * per key and emits one [[ResultBean]] carrying the updated total for every
 * input element.
 */
private class DemoProcess extends KeyedProcessFunction[String, DemoBean, ResultBean] {

  // Boxed Integer so that "no state yet" is observable as null; with a Scala Int
  // the null check can never be true because the value is unboxed first.
  private var hisSumState: ValueState[java.lang.Integer] = _

  /** Registers the per-key state handle named `his-sum`. */
  override def open(parameters: Configuration): Unit = {
    hisSumState = getRuntimeContext.getState(
      new ValueStateDescriptor[java.lang.Integer]("his-sum", classOf[java.lang.Integer]))
  }

  /**
   * Adds the element's value to the total stored for the current key, stores
   * the new total and emits it.
   *
   * @param data incoming element
   * @param ctx  process-function context (not used)
   * @param out  collector receiving exactly one result per element
   */
  override def processElement(data: DemoBean, ctx: KeyedProcessFunction[String, DemoBean, ResultBean]#Context, out: Collector[ResultBean]): Unit = {
    // The first element of a key starts from 0.
    val previousSum = Option(hisSumState.value()).fold(0)(_.intValue())
    val runningSum = previousSum + data.value
    // Persist the accumulated total, not just the latest value, so later
    // elements keep adding to the full history.
    hisSumState.update(runningSum)
    // The fourth field is the event timestamp, not the value.
    out.collect(ResultBean(data.id, data.data, runningSum, data.ts))
  }
}
/**
 * Applies the bulk-flush, retry and HTTP timeout settings to an Elasticsearch
 * sink builder. The builder is mutated in place.
 *
 * @param esSinkBuilder  builder to configure
 * @param numMaxActions  maximum number of buffered actions before a bulk flush
 * @tparam T element type of the sink
 */
def setESConf[T](esSinkBuilder: ElasticsearchSink.Builder[T], numMaxActions: Int): Unit = {
  // Flush when any threshold is hit: action count, 10 MB, or 10000 ms
  // (interval is in milliseconds per the Flink builder API).
  esSinkBuilder.setBulkFlushMaxActions(numMaxActions)
  esSinkBuilder.setBulkFlushMaxSizeMb(10)
  esSinkBuilder.setBulkFlushInterval(10000)

  // Retry rejected bulk requests: up to 3 attempts with a base delay of 2.
  esSinkBuilder.setBulkFlushBackoff(true)
  esSinkBuilder.setBulkFlushBackoffDelay(2)
  esSinkBuilder.setBulkFlushBackoffRetries(3)

  // The callback is created inside the factory method so that the factory
  // itself holds no reference to it.
  val clientFactory = new RestClientFactory {
    override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
      val timeouts = new RestClientBuilder.RequestConfigCallback {
        override def customizeRequestConfig(requestConfigBuilder: RequestConfig.Builder): RequestConfig.Builder =
          requestConfigBuilder
            .setConnectTimeout(12000)
            .setSocketTimeout(90000)
      }
      restClientBuilder.setRequestConfigCallback(timeouts)
    }
  }
  esSinkBuilder.setRestClientFactory(clientFactory)
}
/**
 * Input record built from one tab-separated Kafka message.
 *
 * @param id    key the stream is partitioned by
 * @param data  payload text, passed through unchanged to the result
 * @param value numeric value that [[DemoProcess]] combines with its per-key state
 * @param ts    timestamp handed to Flink as the element's event time
 */
private case class DemoBean(
  id: String,
  data: String,
  value: Int,
  ts: Long
)
/**
 * Output record indexed into Elasticsearch.
 *
 * @param id   used as the Elasticsearch document id
 * @param data written to the `data` field
 * @param sum  written to the `sum` field
 * @param ts   written to the `@timestamp` field
 */
private case class ResultBean(
  id: String,
  data: String,
  sum: Int,
  ts: Long
)
}