- 参考
- 需求背景
- 模拟场景
- pom文件
- code
- 自定义数据源
- 业务代码
- 自定义反序列化
- table_sql
- 运行效果截图
- 未识别的 clazz_id 为 5
- 添加 clazz_id 为5的 clazz 信息,然后删除。
- Mysql 语句。
https://ververica.github.io/flink-cdc-connectors/release-2.1/content/connectors/mysql-cdc.html
需求背景无论是实时还是离线场景,都离不开维表的关联。 痛点: 维表更新 ·离线处理还好,更新之后重新处理一下我们的业务逻辑即可。 ·但是流式/实时场景中,码表的更新,意味着有关联不上的数据,cdc诞生之前,我们可能会将未匹配到的数据写到‘未识别’的分区中,再跑一次离线任务关联未匹配到的数据,但是这种处理对于实时是很不友好的。模拟场景
现有流式数据 student(id, name, age, clazz_id, clazz_name(需要匹配关联))。 码表存储在Mysql clazz_table中。 用DataStream方式处理。 ·关联出 clazz_name 。 ·增删 clazz_table 模拟码表更新。 ·将结果输出(验证阶段 print)。pom文件
多余的自己过滤
code 自定义数据源UTF-8 1.8 1.8 1.13.1 2.11 2.1.1-cdh6.1.1 3.0.0-cdh6.1.1 2.8.2 1.2.7 cloudera https://repository.cloudera.com/artifactory/cloudera-repos/ org.apache.commons commons-compress 1.19 com.alibaba fastjson ${fastjson.version} commons-cli commons-cli 1.4 org.apache.flink flink-streaming-scala_${scala.version} ${flink.version} org.apache.flink flink-table-api-scala-bridge_2.11 ${flink.version} org.apache.flink flink-table-api-java-bridge_2.11 ${flink.version} org.apache.flink flink-clients_${scala.version} ${flink.version} commons-compress org.apache.commons org.apache.flink flink-table-planner-blink_2.11 ${flink.version} org.apache.flink flink-table-runtime-blink_${scala.version} ${flink.version} com.datamountaineer kafka-connect-common 1.1.7 org.apache.kafka kafka-clients 2.8.0 org.apache.flink flink-table-common ${flink.version} org.apache.flink flink-sql-connector-kafka_${scala.version} ${flink.version} org.apache.flink flink-connector-hbase-2.2_${scala.version} ${flink.version} org.apache.flink flink-json ${flink.version} org.apache.flink flink-connector-hive_2.11 ${flink.version} org.apache.hive hive-exec ${hive.version} calcite-avatica org.apache.calcite calcite-core org.apache.calcite calcite-linq4j org.apache.calcite commons-compress org.apache.commons org.apache.hadoop hadoop-client ${hadoop.version} provided commons-compress org.apache.commons org.apache.flink flink-parquet_2.11 ${flink.version} org.apache.flink flink-orc_2.11 ${flink.version} org.apache.flink flink-connector-kafka_${scala.version} ${flink.version} org.apache.flink flink-connector-base ${flink.version} org.projectlombok lombok 1.16.18 org.apache.logging.log4j log4j-slf4j-impl ${log4j.version} runtime org.apache.logging.log4j log4j-api ${log4j.version} runtime org.apache.logging.log4j log4j-core ${log4j.version} runtime mysql mysql-connector-java 8.0.16 org.apache.flink flink-connector-jdbc_2.11 ${flink.version} com.ververica flink-connector-mysql-cdc 2.1.0
// Flat event record for the student fact stream, enriched with the class name
// resolved from the broadcast dimension table (clazz_table).
// Lombok generates getters/setters, equals/hashCode (@Data), both constructors
// and toString. Serializable so Flink can ship instances between operators.
@Data
@NoArgsConstructor
@AllArgsConstructor
@ToString
public class Student implements Serializable {
// Primary key of the student record.
private String id;
private String name;
// NOTE(review): age kept as String, presumably to match the raw source format — confirm.
private String age;
// Foreign key into clazz_table.clazz_id; used as the broadcast-state lookup key.
private String clazz;
// Filled in by the broadcast join; set to "未识别" when no dimension row matches yet.
private String clazzName;
}
业务代码
数据量小,码表存储在广播变量里, 数据量大可以考虑三级缓存。
import com.alibaba.fastjson.JSONObject;
import com.gwm.entity.Student;
import com.gwm.utils.MyDeseUtil;
import com.gwm.utils.StudentSource;
import com.ververica.cdc.connectors.mysql.source.MySqlSource;
import com.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;
import java.util.Properties;
/**
 * Streaming dimension-table join demo: enriches a simulated student stream with
 * clazz_name by broadcasting MySQL CDC changes of test.clazz_table to every subtask.
 */
public class FlinkCDCMysql {

    /**
     * Broadcast-state descriptor (clazz_id -> clazz_name). A single static final
     * instance is shared by the write side (processBroadcastElement) and the
     * read side (processElement) instead of being lazily rebuilt in open().
     */
    private static final MapStateDescriptor<String, String> DIM_STATE_DESCRIPTOR =
            new MapStateDescriptor<>("dim_broadcast",
                    BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpointing is required so the CDC source can commit binlog offsets.
        env.enableCheckpointing(3000);

        // Simulated fact stream: student(id, name, age, clazz_id, clazz_name to fill in).
        final DataStreamSource<Student> studentDS = env.addSource(new StudentSource());

        Properties debeziumProperties = new Properties();
        // Skip the global read lock during the initial snapshot of the dimension table.
        debeziumProperties.put("snapshot.locking.mode", "none");

        final MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
                .hostname("")
                .port(3306)
                .databaseList("test")           // set captured database
                .tableList("test.clazz_table")  // set captured table
                .username("root")
                .password("123456")
                .deserializer(new MyDeseUtil()) // converts SourceRecord to a JSON String
                .debeziumProperties(debeziumProperties)
                .build();

        final DataStreamSource<String> dimSource =
                env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "mysql_source");

        final BroadcastStream<String> dimBc = dimSource.broadcast(DIM_STATE_DESCRIPTOR);

        final SingleOutputStreamOperator<Student> processDs = studentDS
                .connect(dimBc)
                .process(new BroadcastProcessFunction<Student, String, Student>() {

                    /**
                     * Enriches each student with the class name from broadcast state.
                     * Unmatched rows are tagged "未识别" so they can be reprocessed later.
                     */
                    @Override
                    public void processElement(Student student, ReadOnlyContext ctx,
                                               Collector<Student> out) throws Exception {
                        final ReadOnlyBroadcastState<String, String> broadcastState =
                                ctx.getBroadcastState(DIM_STATE_DESCRIPTOR);
                        if (broadcastState.contains(student.getClazz())) {
                            student.setClazzName(broadcastState.get(student.getClazz()));
                        } else {
                            // Dimension row not (yet) seen on this subtask.
                            student.setClazzName("未识别");
                        }
                        out.collect(student);
                    }

                    /**
                     * Applies one CDC change (JSON produced by MyDeseUtil) to the
                     * broadcast map: DELETE removes the mapping, everything else
                     * (READ snapshot, CREATE, UPDATE) upserts it.
                     */
                    @Override
                    public void processBroadcastElement(String value, Context ctx,
                                                        Collector<Student> out) throws Exception {
                        final BroadcastState<String, String> broadcastState =
                                ctx.getBroadcastState(DIM_STATE_DESCRIPTOR);
                        final JSONObject jsonObject = JSONObject.parseObject(value);
                        // {"before":{"name":"dawang","id":"7","age":"22"},"dbName":"test","after":{},"operation":"DELETE","tableName":"student"}
                        // {"before":{},"dbName":"test","after":{"name":"dawang","id":"7","age":"33"},"operation":"READ","tableName":"student"}
                        // {"before":{"name":"dawang","id":"7","age":"22"},"dbName":"test","after":{"name":"dawang","id":"7","age":"44"},"operation":"UPDATE","tableName":"student"}
                        final JSONObject before = jsonObject.getJSONObject("before");
                        final JSONObject after = jsonObject.getJSONObject("after");
                        final String operation = jsonObject.getString("operation");

                        if ("DELETE".equals(operation)) {
                            // On delete only "before" carries the key.
                            final String beforeClazzId = before.getString("clazz_id");
                            System.out.println("移除: " + beforeClazzId);
                            broadcastState.remove(beforeClazzId);
                        } else {
                            broadcastState.put(after.getString("clazz_id"),
                                               after.getString("clazz_name"));
                        }
                    }
                });

        processDs.print("process: ");
        env.execute("student");
    }
}
自定义反序列化
import com.alibaba.fastjson.JSONObject; import com.ververica.cdc.debezium.DebeziumDeserializationSchema; import io.debezium.data.Envelope; import org.apache.flink.api.common.typeinfo.BasicTypeInfo; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.table.runtime.typeutils.StringDataTypeInfo; import org.apache.flink.util.Collector; import org.apache.kafka.connect.data.Field; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; import java.util.List; public class MyDeseUtil implements DebeziumDeserializationSchematable_sql{ @Override public void deserialize(SourceRecord record, Collector out) throws Exception { JSONObject jsonObject = new JSONObject(); // mysql_binlog_source.test.student final String topic = record.topic(); final String[] splits = topic.split("\."); String dbName = splits[1]; String tableName = splits[2]; jsonObject.put("dbName", dbName); jsonObject.put("tableName", tableName); final Schema schema = record.valueSchema(); final Struct value = (Struct) record.value(); // final Struct before = value.getStruct("before"); final JSONObject before = supplyJSONObject(value, "before"); final JSONObject after = supplyJSONObject(value, "after"); jsonObject.put("before", before); jsonObject.put("after", after); //获取操作类型 final Envelope.Operation operation = Envelope.operationFor(record); jsonObject.put("operation", operation); out.collect(jsonObject.toJSONString()); } @Override public TypeInformation getProducedType() { return BasicTypeInfo.STRING_TYPE_INFO; } private JSONObject supplyJSONObject(Struct struct, String structName) { JSONObject jsonObject = new JSONObject(); final Struct udfStruct = struct.getStruct(structName); if (null != udfStruct) { final Schema schema = udfStruct.schema(); final List fields = schema.fields(); fields.forEach( field -> { jsonObject.put(field.name(), udfStruct.getString(field.name())); }); } return 
jsonObject; } }
CREATE TABLE `clazz_table` ( `clazz_id` varchar(11) NOT NULL, `clazz_name` varchar(255) NOT NULL, PRIMARY KEY (`clazz_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8

运行效果截图

未识别的 clazz_id 为 5。添加 clazz_id 为 5 的 clazz 信息,然后删除。Mysql 语句。
INSERT into clazz_table VALUES("5", "五班");
DELETE FROM clazz_table where clazz_id = '5';
-- 自行测试 update 语句。
-- TODO test UPDATE clazz_table SET clazz_name = "x班" WHERE clazz_id = '8';



