使用 Umi-OCR 识别后,文字的排列顺序有时会错乱,可以写一些函数对结果进行重排优化;但由于本人能力有限,无法自行完成对识别后 JSON 文本的重排。
以下是针对识别后的 JSON 文本向 AI 提出的问题:
[{"box":[[12,71],[30,71],[30,91],[12,91]],"score":0.99603396654129028,"text":"6","end":" "},
{"box":[[11,120],[30,120],[30,139],[11,139]],"score":0.99924361705780029,"text":"7","end":" "},
{"box":[[12,170],[30,170],[30,190],[12,190]],"score":0.99798190593719482,"text":"8","end":"\n"},
{"box":[[43,17],[356,17],[356,40],[43,40]],"score":0.99932154161589482,"text":"男生复读3年高考查分喜极而泣","end":"\n"},
{"box":[[346,19],[381,19],[381,40],[346,40]],"score":0.99749338626861572,"text":"热","end":"\n"},
{"box":[[375,67],[413,67],[413,90],[375,90]],"score":0.64695890247821808,"text":"日新","end":"\n"},
{"box":[[46,68],[387,68],[387,91],[46,91]],"score":0.98347348372141519,"text":"中方回应特朗普要中国买美国石油","end":"\n"},
{"box":[[358,117],[397,117],[397,140],[358,140]],"score":0.83765721321105957,"text":"分新","end":"\n"},
{"box":[[47,119],[369,119],[369,139],[47,139]],"score":0.99872339197567528,"text":"湖北高二学生参加高考数学满分","end":"\n"},
{"box":[[41,166],[419,164],[420,190],[41,192]],"score":0.95793653235715981,"text":"个人养老金为何要交3%个税?专..","end":"\n"},
{"box":[[409,168],[435,168],[435,189],[409,189]],"score":0.99883073568344116,"text":"新","end":"\n"},
{"box":[[10,217],[315,217],[315,240],[10,240]],"score":0.99904135098824132,"text":"9A股收盘:沪指创年内新高","end":"\n"},
{"box":[[304,218],[337,218],[337,238],[304,238]],"score":0.9944075345993042,"text":"新","end":"\n"}]
以上是图片文字识别的返回值。
请用js代码,按照从左到右,从上到下的顺序将其文字提取出来
首先按y坐标对元素进行排序
然后遍历元素,当y坐标差小于5时视为同一行
对每一行内的元素按x坐标从左到右排序
将同一行的内容合并为一个字符串
最后用换行符连接所有行
给出的js代码可以在wps 表格js编辑器中运行,可以得到需要的效果
图片 重排前 重排后
/**
 * Rebuilds the reading order (top to bottom, left to right) of OCR results
 * and logs the resulting text.
 *
 * Called with no arguments it behaves as a self-contained macro: it processes
 * the embedded sample data and logs the result. Passing an array of OCR items
 * lets other code reuse the same reordering and read the returned string.
 *
 * @param {Array<{box: number[][], text: string}>} [ocrItems] - OCR items whose
 *   `box[0]` is the `[x, y]` top-left corner. Anything that is not an array
 *   (including omitting the argument) falls back to the embedded sample data.
 * @param {number} [lineTolerance=5] - Two vertically adjacent items belong to
 *   the same row when their top `y` values differ by less than this amount.
 *   Non-numeric or non-positive values fall back to 5.
 * @returns {string} One string per row, rows joined with `\n`.
 */
function 识别文字重排(ocrItems, lineTolerance = 5) {
  const DEFAULT_TOLERANCE = 5;

  // Coordinates of the top-left corner of an item's box.
  const leftOf = (item) => item.box[0][0];
  const topOf = (item) => item.box[0][1];

  // True when the item carries a numeric top-left corner we can sort on.
  const hasTopLeftCorner = (item) =>
    item !== null &&
    typeof item === 'object' &&
    Array.isArray(item.box) &&
    Array.isArray(item.box[0]) &&
    Number.isFinite(item.box[0][0]) &&
    Number.isFinite(item.box[0][1]);

  /**
   * Groups OCR items into rows and concatenates their text.
   *
   * @param {Array} data - OCR items; the array itself is never modified.
   * @param {number} tolerance - Maximum (exclusive) `y` gap inside one row.
   * @returns {string} Row texts joined with `\n`; empty string for no input.
   */
  function extractText(data, tolerance) {
    if (!Array.isArray(data)) {
      return '';
    }

    // Work on a filtered copy so the caller's array keeps its order, and so
    // entries without a usable box are skipped instead of throwing.
    const sorted = data
      .filter(hasTopLeftCorner)
      .sort((a, b) => topOf(a) - topOf(b));

    const rows = [];
    let previousTop = null;
    for (const item of sorted) {
      const top = topOf(item);
      // Each item is compared with the item just before it (in top-to-bottom
      // order), not with the first item of the row.
      const isSameRow =
        previousTop !== null && Math.abs(top - previousTop) < tolerance;
      if (isSameRow) {
        rows[rows.length - 1].push(item);
      } else {
        rows.push([item]);
      }
      previousTop = top;
    }

    // Every row is a fresh array, so sorting it in place is safe.
    return rows
      .map((row) =>
        row
          .sort((a, b) => leftOf(a) - leftOf(b))
          .map((item) => item.text)
          .join(''),
      )
      .join('\n');
  }

  // Sample data used when the macro is run without arguments.
  const sampleData = [
    {"box":[[12,71],[30,71],[30,91],[12,91]],"score":0.99603396654129028,"text":"6","end":" "},
    {"box":[[11,120],[30,120],[30,139],[11,139]],"score":0.99924361705780029,"text":"7","end":" "},
    {"box":[[12,170],[30,170],[30,190],[12,190]],"score":0.99798190593719482,"text":"8","end":"\n"},
    {"box":[[43,17],[356,17],[356,40],[43,40]],"score":0.99932154161589482,"text":"男生复读3年高考查分喜极而泣","end":"\n"},
    {"box":[[346,19],[381,19],[381,40],[346,40]],"score":0.99749338626861572,"text":"热","end":"\n"},
    {"box":[[375,67],[413,67],[413,90],[375,90]],"score":0.64695890247821808,"text":"日新","end":"\n"},
    {"box":[[46,68],[387,68],[387,91],[46,91]],"score":0.98347348372141519,"text":"中方回应特朗普要中国买美国石油","end":"\n"},
    {"box":[[358,117],[397,117],[397,140],[358,140]],"score":0.83765721321105957,"text":"分新","end":"\n"},
    {"box":[[47,119],[369,119],[369,139],[47,139]],"score":0.99872339197567528,"text":"湖北高二学生参加高考数学满分","end":"\n"},
    {"box":[[41,166],[419,164],[420,190],[41,192]],"score":0.95793653235715981,"text":"个人养老金为何要交3%个税?专..","end":"\n"},
    {"box":[[409,168],[435,168],[435,189],[409,189]],"score":0.99883073568344116,"text":"新","end":"\n"},
    {"box":[[10,217],[315,217],[315,240],[10,240]],"score":0.99904135098824132,"text":"9A股收盘:沪指创年内新高","end":"\n"},
    {"box":[[304,218],[337,218],[337,238],[304,238]],"score":0.9944075345993042,"text":"新","end":"\n"},
  ];

  const items = Array.isArray(ocrItems) ? ocrItems : sampleData;
  const tolerance =
    typeof lineTolerance === 'number' && lineTolerance > 0
      ? lineTolerance
      : DEFAULT_TOLERANCE;

  // Extract, log (the original macro behaviour) and also hand the text back.
  const extractedText = extractText(items, tolerance);
  console.log(extractedText);
  return extractedText;
}
是否可以用 qk 将这个函数改写一下,以便直接调用?