DataX源码分析 TaskGroupContainer
系列文章目录
一、DataX详解和架构介绍
二、DataX源码分析 JobContainer
三、DataX源码分析 TaskGroupContainer
四、DataX源码分析 TaskExecutor
五、DataX源码分析 reader
六、DataX源码分析 writer
七、DataX源码分析 Channel
文章目录
- 系列文章目录
- TaskGroupContainer
- 初始化
- start方法详细步骤
- 1、初始化task执行相关的状态信息
- 2、开始执行任务while (true)循环
- TaskGroupContainer源码
TaskGroupContainer
DataX的TaskGroupContainer是JobContainer将所有task分配到TaskGroup中执行的一个容器。这个容器的主要入口是start方法,该方法会执行两个主要任务:
初始化task执行相关的状态信息和 循环检测所有任务的执行状态 。此外,TaskGroupContainer还有一个名为reportTaskGroupCommunication的方法,用于向容器汇报状态。这个方法会收集当前TaskGroupContainer对应所有Task的通信信息,并将其合并成一个通信信息。
初始化
设置配置文件Configuration
初始化监控
设置jobId
设置taskGroupId
设置channel类实例channelClazz
start方法详细步骤
1、初始化task执行相关的状态信息
- taskConfigMap:taskId与其对应的Congifuration的map映射集合
- 待运行的任务队列taskQueue
- 运行失败任务taskFailedExecutorMap
- 正在执行的任务集合runTasks
- 任务开始时间taskStartTimeMap。
2、开始执行任务while (true)循环
1.判断task状态
循环遍历所有任务,如果任务尚未完成跳过。如果任务已经完成从任务列表中删除。如果任务失败判断是否支持重试,如支持重试并重试次数没有超过最大限制则重试执行。
2.发现该taskGroup下taskExecutor的总状态失败则汇报错误
3.有任务未执行,且正在运行的任务数小于最大通道限制,创建TaskExecutor实例,调用doStart真正执行数据同步任务,从待运行task列表中删除同时加入到正在运行的队列。TaskExecutor构建的时候,生成一个reader、channel和writer,并启动两个线程,reader生产数据写入channel,writer从channel中读数据,任务执行完毕时,通过wirter将任务状态置为成功。
4.检查执行队列和所有的任务状态,如果所有的任务都执行成功,则汇报taskGroup的状态并从循环中退出。
5.如果当前时间已经超出汇报时间的interval,那么我们需要马上汇报
6.当所有的执行完成从while中退出之后,再次全局汇报当前的任务状态
TaskGroupContainer源码
/**
* task任务运行容器
**/
public class TaskGroupContainer extends AbstractContainer {
private static final Logger LOG = LoggerFactory
.getLogger(TaskGroupContainer.class);
/**
* 当前taskGroup所属jobId
*/
private long jobId;
/**
* 当前taskGroupId
*/
private int taskGroupId;
/**
* 使用的channel类
*/
private String channelClazz;
/**
* task收集器使用的类
*/
private String taskCollectorClass;
private TaskMonitor taskMonitor = TaskMonitor.getInstance();
public TaskGroupContainer(Configuration configuration) {
super(configuration);
initCommunicator(configuration);
this.jobId = this.configuration.getLong(
CoreConstant.DATAX_CORE_CONTAINER_JOB_ID);
this.taskGroupId = this.configuration.getInt(
CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID);
this.channelClazz = this.configuration.getString(
CoreConstant.DATAX_CORE_TRANSPORT_CHANNEL_CLASS);
this.taskCollectorClass = this.configuration.getString(
CoreConstant.DATAX_CORE_STATISTICS_COLLECTOR_PLUGIN_TASKCLASS);
}
private void initCommunicator(Configuration configuration) {
super.setContainerCommunicator(new StandaloneTGContainerCommunicator(configuration));
}
public long getJobId() {
return jobId;
}
public int getTaskGroupId() {
return taskGroupId;
}
@Override
public void start() {
try {
/**
* 状态check时间间隔,较短,可以把任务及时分发到对应channel中
*/
int sleepIntervalInMillSec = this.configuration.getInt(
CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
/**
* 状态汇报时间间隔,稍长,避免大量汇报
*/
long reportIntervalInMillSec = this.configuration.getLong(
CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL,
10000);
/**
* 2分钟汇报一次性能统计
*/
// 获取channel数目
int channelNumber = this.configuration.getInt(
CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);
int taskMaxRetryTimes = this.configuration.getInt(
CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);
long taskRetryIntervalInMsec = this.configuration.getLong(
CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);
long taskMaxWaitInMsec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);
List<Configuration> taskConfigs = this.configuration
.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
if(LOG.isDebugEnabled()) {
LOG.debug("taskGroup[{}]'s task configs[{}]", this.taskGroupId,
JSON.toJSONString(taskConfigs));
}
int taskCountInThisTaskGroup = taskConfigs.size();
LOG.info(String.format(
"taskGroupId=[%d] start [%d] channels for [%d] tasks.",
this.taskGroupId, channelNumber, taskCountInThisTaskGroup));
this.containerCommunicator.registerCommunication(taskConfigs);
Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs); //taskId与task配置
List<Configuration> taskQueue = buildRemainTasks(taskConfigs); //待运行task列表
Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>(); //taskId与上次失败实例
List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber); //正在运行task
Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>(); //任务开始时间
long lastReportTimeStamp = 0;
Communication lastTaskGroupContainerCommunication = new Communication();
while (true) {
//1.判断task状态
boolean failedOrKilled = false;
Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
for(Map.Entry<Integer, Communication> entry : communicationMap.entrySet()){
Integer taskId = entry.getKey();
Communication taskCommunication = entry.getValue();
if(!taskCommunication.isFinished()){
continue;
}
TaskExecutor taskExecutor = removeTask(runTasks, taskId);
//上面从runTasks里移除了,因此对应在monitor里移除
taskMonitor.removeTask(taskId);
//失败,看task是否支持failover,重试次数未超过最大限制
if(taskCommunication.getState() == State.FAILED){
taskFailedExecutorMap.put(taskId, taskExecutor);
if(taskExecutor.supportFailOver() && taskExecutor.getAttemptCount() < taskMaxRetryTimes){
taskExecutor.shutdown(); //关闭老的executor
containerCommunicator.resetCommunication(taskId); //将task的状态重置
Configuration taskConfig = taskConfigMap.get(taskId);
taskQueue.add(taskConfig); //重新加入任务列表
}else{
failedOrKilled = true;
break;
}
}else if(taskCommunication.getState() == State.KILLED){
failedOrKilled = true;
break;
}else if(taskCommunication.getState() == State.SUCCEEDED){
Long taskStartTime = taskStartTimeMap.get(taskId);
if(taskStartTime != null){
Long usedTime = System.currentTimeMillis() - taskStartTime;
LOG.info("taskGroup[{}] taskId[{}] is successed, used[{}]ms",
this.taskGroupId, taskId, usedTime);
//usedTime*1000*1000 转换成PerfRecord记录的ns,这里主要是简单登记,进行最长任务的打印。因此增加特定静态方法
PerfRecord.addPerfRecord(taskGroupId, taskId, PerfRecord.PHASE.TASK_TOTAL,taskStartTime, usedTime * 1000L * 1000L);
taskStartTimeMap.remove(taskId);
taskConfigMap.remove(taskId);
}
}
}
// 2.发现该taskGroup下taskExecutor的总状态失败则汇报错误
if (failedOrKilled) {
lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
throw DataXException.asDataXException(
FrameworkErrorCode.PLUGIN_RUNTIME_ERROR, lastTaskGroupContainerCommunication.getThrowable());
}
//3.有任务未执行,且正在运行的任务数小于最大通道限制
Iterator<Configuration> iterator = taskQueue.iterator();
while(iterator.hasNext() && runTasks.size() < channelNumber){
Configuration taskConfig = iterator.next();
Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
int attemptCount = 1;
TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
if(lastExecutor!=null){
attemptCount = lastExecutor.getAttemptCount() + 1;
long now = System.currentTimeMillis();
long failedTime = lastExecutor.getTimeStamp();
if(now - failedTime < taskRetryIntervalInMsec){ //未到等待时间,继续留在队列
continue;
}
if(!lastExecutor.isShutdown()){ //上次失败的task仍未结束
if(now - failedTime > taskMaxWaitInMsec){
markCommunicationFailed(taskId);
reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
throw DataXException.asDataXException(CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
}else{
lastExecutor.shutdown(); //再次尝试关闭
continue;
}
}else{
LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown",
this.taskGroupId, taskId, lastExecutor.getAttemptCount());
}
}
Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
taskStartTimeMap.put(taskId, System.currentTimeMillis());
taskExecutor.doStart();
iterator.remove();
runTasks.add(taskExecutor);
//上面,增加task到runTasks列表,因此在monitor里注册。
taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));
taskFailedExecutorMap.remove(taskId);
LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] is started",
this.taskGroupId, taskId, attemptCount);
}
//4.任务列表为空,executor已结束, 搜集状态为success--->成功
if (taskQueue.isEmpty() && isAllTaskDone(runTasks) && containerCommunicator.collectState() == State.SUCCEEDED) {
// 成功的情况下,也需要汇报一次。否则在任务结束非常快的情况下,采集的信息将会不准确
lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
break;
}
// 5.如果当前时间已经超出汇报时间的interval,那么我们需要马上汇报
long now = System.currentTimeMillis();
if (now - lastReportTimeStamp > reportIntervalInMillSec) {
lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
lastReportTimeStamp = now;
//taskMonitor对于正在运行的task,每reportIntervalInMillSec进行检查
for(TaskExecutor taskExecutor:runTasks){
taskMonitor.report(taskExecutor.getTaskId(),this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
}
}
Thread.sleep(sleepIntervalInMillSec);
}
//6.最后还要汇报一次
reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
} catch (Throwable e) {
Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();
if (nowTaskGroupContainerCommunication.getThrowable() == null) {
nowTaskGroupContainerCommunication.setThrowable(e);
}
nowTaskGroupContainerCommunication.setState(State.FAILED);
this.containerCommunicator.report(nowTaskGroupContainerCommunication);
throw DataXException.asDataXException(
FrameworkErrorCode.RUNTIME_ERROR, e);
}finally {
if(!PerfTrace.getInstance().isJob()){
//最后打印cpu的平均消耗,GC的统计
VMInfo vmInfo = VMInfo.getVmInfo();
if (vmInfo != null) {
vmInfo.getDelta(false);
LOG.info(vmInfo.totalString());
}
LOG.info(PerfTrace.getInstance().summarizeNoException());
}
}
}
private Map<Integer, Configuration> buildTaskConfigMap(List<Configuration> configurations){
Map<Integer, Configuration> map = new HashMap<Integer, Configuration>();
for(Configuration taskConfig : configurations){
int taskId = taskConfig.getInt(CoreConstant.TASK_ID);
map.put(taskId, taskConfig);
}
return map;
}
private List<Configuration> buildRemainTasks(List<Configuration> configurations){
List<Configuration> remainTasks = new LinkedList<Configuration>();
for(Configuration taskConfig : configurations){
remainTasks.add(taskConfig);
}
return remainTasks;
}
private TaskExecutor removeTask(List<TaskExecutor> taskList, int taskId){
Iterator<TaskExecutor> iterator = taskList.iterator();
while(iterator.hasNext()){
TaskExecutor taskExecutor = iterator.next();
if(taskExecutor.getTaskId() == taskId){
iterator.remove();
return taskExecutor;
}
}
return null;
}
private boolean isAllTaskDone(List<TaskExecutor> taskList){
for(TaskExecutor taskExecutor : taskList){
if(!taskExecutor.isTaskFinished()){
return false;
}
}
return true;
}
private Communication reportTaskGroupCommunication(Communication lastTaskGroupContainerCommunication, int taskCount){
Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();
nowTaskGroupContainerCommunication.setTimestamp(System.currentTimeMillis());
Communication reportCommunication = CommunicationTool.getReportCommunication(nowTaskGroupContainerCommunication,
lastTaskGroupContainerCommunication, taskCount);
this.containerCommunicator.report(reportCommunication);
return reportCommunication;
}
private void markCommunicationFailed(Integer taskId){
Communication communication = containerCommunicator.getCommunication(taskId);
communication.setState(State.FAILED);
}
/**
* TaskExecutor是一个完整task的执行器
* 其中包括1:1的reader和writer
*/
class TaskExecutor {
private Configuration taskConfig;
private int taskId;
private int attemptCount;
private Channel channel;
private Thread readerThread;
private Thread writerThread;
private ReaderRunner readerRunner;
private WriterRunner writerRunner;
/**
* 该处的taskCommunication在多处用到:
* 1. channel
* 2. readerRunner和writerRunner
* 3. reader和writer的taskPluginCollector
*/
private Communication taskCommunication;
public TaskExecutor(Configuration taskConf, int attemptCount) {
// 获取该taskExecutor的配置
this.taskConfig = taskConf;
Validate.isTrue(null != this.taskConfig.getConfiguration(CoreConstant.JOB_READER)
&& null != this.taskConfig.getConfiguration(CoreConstant.JOB_WRITER),
"[reader|writer]的插件参数不能为空!");
// 得到taskId
this.taskId = this.taskConfig.getInt(CoreConstant.TASK_ID);
this.attemptCount = attemptCount;
/**
* 由taskId得到该taskExecutor的Communication
* 要传给readerRunner和writerRunner,同时要传给channel作统计用
*/
this.taskCommunication = containerCommunicator
.getCommunication(taskId);
Validate.notNull(this.taskCommunication,
String.format("taskId[%d]的Communication没有注册过", taskId));
this.channel = ClassUtil.instantiate(channelClazz,
Channel.class, configuration);
this.channel.setCommunication(this.taskCommunication);
/**
* 获取transformer的参数
*/
List<TransformerExecution> transformerInfoExecs = TransformerUtil.buildTransformerInfo(taskConfig);
/**
* 生成writerThread
*/
writerRunner = (WriterRunner) generateRunner(PluginType.WRITER);
this.writerThread = new Thread(writerRunner,
String.format("%d-%d-%d-writer",
jobId, taskGroupId, this.taskId));
//通过设置thread的contextClassLoader,即可实现同步和主程序不通的加载器
this.writerThread.setContextClassLoader(LoadUtil.getJarLoader(
PluginType.WRITER, this.taskConfig.getString(
CoreConstant.JOB_WRITER_NAME)));
/**
* 生成readerThread
*/
readerRunner = (ReaderRunner) generateRunner(PluginType.READER,transformerInfoExecs);
this.readerThread = new Thread(readerRunner,
String.format("%d-%d-%d-reader",
jobId, taskGroupId, this.taskId));
/**
* 通过设置thread的contextClassLoader,即可实现同步和主程序不通的加载器
*/
this.readerThread.setContextClassLoader(LoadUtil.getJarLoader(
PluginType.READER, this.taskConfig.getString(
CoreConstant.JOB_READER_NAME)));
}
public void doStart() {
this.writerThread.start();
// reader没有起来,writer不可能结束
if (!this.writerThread.isAlive() || this.taskCommunication.getState() == State.FAILED) {
throw DataXException.asDataXException(
FrameworkErrorCode.RUNTIME_ERROR,
this.taskCommunication.getThrowable());
}
this.readerThread.start();
// 这里reader可能很快结束
if (!this.readerThread.isAlive() && this.taskCommunication.getState() == State.FAILED) {
// 这里有可能出现Reader线上启动即挂情况 对于这类情况 需要立刻抛出异常
throw DataXException.asDataXException(
FrameworkErrorCode.RUNTIME_ERROR,
this.taskCommunication.getThrowable());
}
}
private AbstractRunner generateRunner(PluginType pluginType) {
return generateRunner(pluginType, null);
}
private AbstractRunner generateRunner(PluginType pluginType, List<TransformerExecution> transformerInfoExecs) {
AbstractRunner newRunner = null;
TaskPluginCollector pluginCollector;
switch (pluginType) {
case READER:
newRunner = LoadUtil.loadPluginRunner(pluginType,
this.taskConfig.getString(CoreConstant.JOB_READER_NAME));
newRunner.setJobConf(this.taskConfig.getConfiguration(
CoreConstant.JOB_READER_PARAMETER));
pluginCollector = ClassUtil.instantiate(
taskCollectorClass, AbstractTaskPluginCollector.class,
configuration, this.taskCommunication,
PluginType.READER);
RecordSender recordSender;
if (transformerInfoExecs != null && transformerInfoExecs.size() > 0) {
recordSender = new BufferedRecordTransformerExchanger(taskGroupId, this.taskId, this.channel,this.taskCommunication ,pluginCollector, transformerInfoExecs);
} else {
recordSender = new BufferedRecordExchanger(this.channel, pluginCollector);
}
((ReaderRunner) newRunner).setRecordSender(recordSender);
/**
* 设置taskPlugin的collector,用来处理脏数据和job/task通信
*/
newRunner.setTaskPluginCollector(pluginCollector);
break;
case WRITER:
newRunner = LoadUtil.loadPluginRunner(pluginType,
this.taskConfig.getString(CoreConstant.JOB_WRITER_NAME));
newRunner.setJobConf(this.taskConfig
.getConfiguration(CoreConstant.JOB_WRITER_PARAMETER));
pluginCollector = ClassUtil.instantiate(
taskCollectorClass, AbstractTaskPluginCollector.class,
configuration, this.taskCommunication,
PluginType.WRITER);
((WriterRunner) newRunner).setRecordReceiver(new BufferedRecordExchanger(
this.channel, pluginCollector));
/**
* 设置taskPlugin的collector,用来处理脏数据和job/task通信
*/
newRunner.setTaskPluginCollector(pluginCollector);
break;
default:
throw DataXException.asDataXException(FrameworkErrorCode.ARGUMENT_ERROR, "Cant generateRunner for:" + pluginType);
}
newRunner.setTaskGroupId(taskGroupId);
newRunner.setTaskId(this.taskId);
newRunner.setRunnerCommunication(this.taskCommunication);
return newRunner;
}
// 检查任务是否结束
private boolean isTaskFinished() {
// 如果reader 或 writer没有完成工作,那么直接返回工作没有完成
if (readerThread.isAlive() || writerThread.isAlive()) {
return false;
}
if(taskCommunication==null || !taskCommunication.isFinished()){
return false;
}
return true;
}
private int getTaskId(){
return taskId;
}
private long getTimeStamp(){
return taskCommunication.getTimestamp();
}
private int getAttemptCount(){
return attemptCount;
}
private boolean supportFailOver(){
return writerRunner.supportFailOver();
}
private void shutdown(){
writerRunner.shutdown();
readerRunner.shutdown();
if(writerThread.isAlive()){
writerThread.interrupt();
}
if(readerThread.isAlive()){
readerThread.interrupt();
}
}
private boolean isShutdown(){
return !readerThread.isAlive() && !writerThread.isAlive();
}
}
}