TensorFlow 中的 CTCGreedyDecoder 仅包含 CPU 实现。而 PaddlePaddle 的实现同时提供了 GPU 版本,更贴近实际需求。简单来说,PaddlePaddle 内部采用算子组合的方式:先通过 topk 算子找到概率最大的类别,然后通过 CTCAlignOp 完成后处理(合并重复标签并删除空白标签)。输出格式方面,TensorFlow 为 SparseTensor,而 PaddlePaddle 支持 Tensor 和 LoDTensor 两种形式。
ctc_greedy_decoder
check_variable_and_dtype 检查变量的类型以及数据类型。
LayerHelper 主要是在各个 layers 函数之间共享代码。
内部调用 topk 算子得到最大概率类别的索引topk_indices。
check_variable_and_dtype(input, 'input', ['float32', 'float64'],
'ctc_greedy_decoder')
helper = LayerHelper("ctc_greedy_decoder", **locals())
_, topk_indices = topk(input, k=1)
LayerHelperBase.create_variable_for_type_inference 创建临时变量。
lod 模式直接通过 ctc_align 来得到最终结果;padding 模式下输入是 3 维的,需要额外创建 ctc_out_len 变量,并先调用 squeeze 算子去掉最后一维,再调用 ctc_align。
# ctc align op
ctc_out = helper.create_variable_for_type_inference(dtype="int64")
if input_length is None:
helper.append_op(
type="ctc_align",
inputs={"Input": [topk_indices]},
outputs={"Output": [ctc_out]},
attrs={"merge_repeated": True,
"blank": blank})
return ctc_out
else:
ctc_out_len = helper.create_variable_for_type_inference(dtype="int64")
ctc_input = squeeze(topk_indices, [2])
helper.append_op(
type="ctc_align",
inputs={"Input": [ctc_input],
"InputLength": [input_length]},
outputs={"Output": [ctc_out],
"OutputLength": [ctc_out_len]},
attrs={
"merge_repeated": True,
"blank": blank,
"padding_value": padding_value
})
return ctc_out, ctc_out_len
根据 REGISTER_OPERATOR 可以找到 Python 函数名和算子实现的对应关系。
CTCAlignOp
OperatorBase 是网络计算的基本元素。
OperatorWithKernel
OP_INOUT_CHECK 确保算子有输入输出。
OperatorWithKernel::IndicateVarDataType 获取变量数据类型。
// Operator definition for "ctc_align".
//
// Declares the shape-inference rule and the kernel-selection rule for the
// operator; the actual computation lives in the registered kernels.
class CTCAlignOp : public framework::OperatorWithKernel {
 public:
  // Reuse the base-class constructors unchanged.
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Infers output shapes at graph-construction time.
  //
  // Requires input "Input" and output "Output" to be present. "Output" is
  // given exactly the dims of "Input". When the optional "InputLength" input
  // is present, "OutputLength" is shaped {Input.dims[0], 1}.
  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ctc_align");
    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ctc_align");

    auto in_dims = ctx->GetInputDim("Input");

    // TODO(wanghaoshuang): the original author flags this as a knowingly
    // inexact ("wrong") dimension for the output; it simply mirrors the input.
    ctx->SetOutputDim("Output", in_dims);

    if (!ctx->HasInput("InputLength")) {
      return;
    }
    ctx->SetOutputDim("OutputLength", {in_dims[0], 1});
  }

 protected:
  // Selects the kernel from the data type of "Input" and the device context
  // of the current execution.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto input_dtype =
        OperatorWithKernel::IndicateVarDataType(ctx, "Input");
    return framework::OpKernelType(input_dtype, ctx.device_context());
  }
};
CTCAlignKernel
模板默认是 CPU 实现。
templateclass CTCAlignKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input ("Input"); auto* output = ctx.Output ("Output"); size_t blank = static_cast (ctx.Attr ("blank")); bool merge_repeated = ctx.Attr ("merge_repeated"); T* output_data = output->mutable_data (ctx.GetPlace()); auto input_dims = input->dims(); const T* input_data = input->data ();
如果是 padding 模式,处理时跳过空白。
// support tensor input, no lod information
if (input->lod().empty()) {
size_t padding_value =
static_cast(ctx.Attr("padding_value"));
auto* input_length = ctx.Input("InputLength");
const T* input_length_data = input_length->data();
auto* output_length = ctx.Output("OutputLength");
T* output_length_data = output_length->mutable_data(ctx.GetPlace());
for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0];
batch_id++) {
T prev_token = -1;
size_t output_idx = 0;
for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
size_t input_ind = batch_id * input_dims[1] + i;
if ((unsigned)input_data[input_ind] != blank &&
!(merge_repeated && input_data[input_ind] == prev_token)) {
output_data[batch_id * input_dims[1] + output_idx] =
input_data[input_ind];
++output_idx;
}
prev_token = input_data[input_ind];
}
output_length_data[batch_id] = output_idx;
for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
output_data[batch_id * input_dims[1] + j] = padding_value;
}
如果是 lod 模式,调用 ToAbsOffset 得到偏移。
} else {
const size_t level = 0;
auto input_lod = framework::ToAbsOffset(input->lod());
// check input dims and lod
PADDLE_ENFORCE_EQ(
input_dims[0], static_cast(input_lod[level].back()),
platform::errors::InvalidArgument(
"The first dimension %d of CTCAlign operator Input(Input) should "
"be equal to "
"the sum of all sequences' lengths %d.",
input_dims[0], static_cast(input_lod[level].back())));
const size_t num_sequences = input_lod[level].size() - 1;
// merge repeated tokens and delete blank
size_t output_idx = 0;
std::vector output_lod0(1, 0);
for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
T prev_token = -1;
for (size_t i = input_lod[level][seq_idx];
i < input_lod[level][seq_idx + 1]; ++i) {
if ((unsigned)input_data[i] != blank &&
!(merge_repeated && input_data[i] == prev_token)) {
output_data[output_idx] = input_data[i];
++output_idx;
}
prev_token = input_data[i];
}
output_lod0.push_back(output_idx);
}
// set output lod
framework::LoD output_lod;
output_lod.push_back(output_lod0);
output->set_lod(output_lod);
// resize output dims
output->Resize({static_cast(output_lod0.back()), 1});
// for empty sequence
if (output_lod0.back() == 0) {
output->Resize({1, 1});
output_data = output->mutable_data(ctx.GetPlace());
output_data[0] = -1;
}
}
}
};
CTCAlignOpCUDAKernel
ExecutionContext::Input 根据名称返回地址。
ExecutionContext::Attr
LoDTensor 为 DenseTensor 类型。
templateclass CTCAlignOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::InvalidArgument( "CTCAlign operator CUDA kernel must use CUDAPlace " "rather than CPUPlace.")); auto* input = ctx.Input ("Input"); auto* output = ctx.Output ("Output"); const int blank = ctx.Attr ("blank"); const int merge_repeated = static_cast (ctx.Attr ("merge_repeated")); const T* tokens = input->data (); auto stream = ctx.cuda_device_context().stream();
DenseTensor::lod 返回 DenseTensorMeta 包含的 LoD 对象。
如果输入没有 Level-of-Detail,为普通 Tensor,调用 PaddingMergeAndDelCudaKernel 函数。
DenseTensor::mutable_data 返回数据指针。
// tensor input which has no lod
if (input->lod().empty()) {
const int padding_value = ctx.Attr("padding_value");
auto input_dims = input->dims();
T* output_data = output->mutable_data({input_dims[0], input_dims[1]},
ctx.GetPlace());
auto* input_length = ctx.Input("InputLength");
const T* input_length_data = input_length->data();
auto* output_length = ctx.Output("OutputLength");
T* output_length_data =
output_length->mutable_data({input_dims[0], 1}, ctx.GetPlace());
PaddingMergeAndDelCudaKernel<
T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
input_dims[1], tokens, input_length_data, blank, merge_repeated,
padding_value, input_dims[0], output_data, output_length_data);
否则调用 MergeAndDelCudaKernel 使用单线程合并删除。
ToAbsOffset 得到偏移。
} else {
const size_t level = 0;
auto input_lod = framework::ToAbsOffset(input->lod());
const int64_t num_tokens = input->dims()[0];
const size_t num_seq = input_lod[level].size() - 1;
// prepare a lod to record lod information while merging elements
thrust::device_vector dev_out_lod0(input_lod[level].size());
size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
// merge elements and delete blank
T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace());
paddle::framework::MixVector mixv_input_lod(&input_lod[level]);
MergeAndDelCudaKernel<<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq,
mixv_input_lod.CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
dev_out_lod0_ptr, output_data);
mixv_input_lod.CopyToCPU();
// set output lod
std::vector host_out_lod0(dev_out_lod0.begin(),
dev_out_lod0.end());
framework::LoD out_lod;
out_lod.push_back(host_out_lod0);
output->set_lod(out_lod);
// resize output dims
output->Resize({static_cast(host_out_lod0.back()), 1});
if (host_out_lod0.back() == 0) {
output->Resize({1, 1});
output->mutable_data(ctx.GetPlace());
phi::funcs::SetConstant set_constant;
set_constant(ctx.template device_context(),
output, -1);
}
}
}
};
PaddingMergeAndDelCudaKernel
每个线程处理单个 batch。
如果当前 token 不是空白标签,并且不属于需要合并的重复标签,则将其写入输出。
template <typename T> __global__ void PaddingMergeAndDelCudaKernel( const int64_t num_token, const T* tokens, const T* tokens_length, const int blank, const int merge_repeated, const int padding_value, const int64_t batch_size, T* output, T* output_length) { int ind = blockIdx.x * blockDim.x + threadIdx.x; if (ind >= batch_size) return; int output_idx = ind * num_token; T prev_token = -1; for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) { if ((unsigned)tokens[i] != blank && !(merge_repeated && tokens[i] == prev_token)) { output[output_idx] = tokens[i]; ++output_idx; } prev_token = tokens[i]; }
记录输出长度到output_length。
末尾填充。
output_length[ind] = output_idx - ind * num_token;
for (int i = output_idx; i < ind * num_token + num_token; i++) {
output[i] = padding_value;
}
}
MergeAndDelCudaKernel
对于每个序列,通过lod0得到索引。
跳过空白标签以及需要合并的情况。
out_lod0记录序列起止点的累计索引。
template <typename T> __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, const size_t num_seq, size_t* lod0, const int blank, const int merge_repeated, size_t* out_lod0, T* output) { int ouput_idx = 0; out_lod0[0] = 0; for (int i = 0; i < num_seq; ++i) { T pre_token = -1; for (int j = lod0[i]; j < lod0[i + 1]; ++j) { if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { output[ouput_idx] = tokens[j]; ++ouput_idx; } pre_token = tokens[j]; } out_lod0[i + 1] = ouput_idx; } }
参考资料:
- Paddle关键概念
- paddle中的LoDTensor
- Paddle 的 LoDTensor
- LoDTensor
- Tensor介绍
- topk
- Paddle关键概念
- LayerHelper没有对应的文档? #24308



