Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/tesseract/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ class TESS_API TessBaseAPI {
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *GetTSVText(int page_number);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an API change. It requires a new major version (Tesseract 6.0.0) and changes in other software, for example tesserocr.

Therefore we cannot simply merge this pull request.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood. If you have a suggestion for how to provide this functionality without modifying the API, I could steer the PR in that direction.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use an overload instead of a default argument: change

char *GetTSVText(int page_number, bool lang_info=false);
to
char *GetTSVText(int page_number, bool lang_info);

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have pushed the overload approach; it does not break the API now.

char *GetTSVText(int page_number, bool lang_info=false);

/**
* Make a box file for LSTM training from the internal data structures.
Expand Down
4 changes: 2 additions & 2 deletions include/tesseract/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ class TESS_API TessAltoRenderer : public TessResultRenderer {
*/
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
explicit TessTsvRenderer(const char *outputbase, bool font_info);
explicit TessTsvRenderer(const char *outputbase, bool lang_info);
explicit TessTsvRenderer(const char *outputbase);

protected:
Expand All @@ -206,7 +206,7 @@ class TESS_API TessTsvRenderer : public TessResultRenderer {
bool EndDocumentHandler() override;

private:
bool font_info_; // whether to print font information
bool lang_info_; // whether to print language information
};

/**
Expand Down
40 changes: 34 additions & 6 deletions src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1421,7 +1421,7 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetTSVText(int page_number) {
char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
return nullptr;
}
Expand All @@ -1434,6 +1434,7 @@ char *TessBaseAPI::GetTSVText(int page_number) {
int par_num = 0;
int line_num = 0;
int word_num = 0;
std::string lang;

std::string tsv_str;
tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
Expand All @@ -1445,7 +1446,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(rect_top_);
tsv_str += "\t" + std::to_string(rect_width_);
tsv_str += "\t" + std::to_string(rect_height_);
tsv_str += "\t-1\t\n";
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t" + lang;
}
tsv_str += "\t\n";

const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
Expand All @@ -1466,9 +1471,16 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for block
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t";
}
tsv_str += "\t\n"; // end of row for block
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
if (lang_info) {
lang = res_it->WordRecognitionLanguage();
}
par_num++;
line_num = 0;
word_num = 0;
Expand All @@ -1478,7 +1490,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for para
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t" + lang;
}
tsv_str += "\t\n"; // end of row for para
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
line_num++;
Expand All @@ -1489,7 +1505,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for line
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t";
}
tsv_str += "\t\n"; // end of row for line
}

// Now, process the word...
Expand All @@ -1506,9 +1526,17 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(right - left);
tsv_str += "\t" + std::to_string(bottom - top);
tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
tsv_str += "\t";

if (lang_info) {
const char *word_lang = res_it->WordRecognitionLanguage();
tsv_str += "\t";
if (word_lang) {
tsv_str += word_lang;
}
}

// Increment counts if at end of block/paragraph/textline.
tsv_str += "\t";
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
lcnt++;
}
Expand Down
14 changes: 9 additions & 5 deletions src/api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,19 +156,23 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
* TSV Text Renderer interface implementation
**********************************************************************/
TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") {
font_info_ = false;
lang_info_ = false;
}

TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool lang_info)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = font_info;
lang_info_ = lang_info;
}

bool TessTsvRenderer::BeginDocumentHandler() {
// Output TSV column headings
AppendString(
"level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
"num\tleft\ttop\twidth\theight\tconf\ttext\n");
"num\tleft\ttop\twidth\theight\tconf\t");
if (lang_info_) {
AppendString("lang\t");
}
AppendString("text\n");
return true;
}

Expand All @@ -177,7 +181,7 @@ bool TessTsvRenderer::EndDocumentHandler() {
}

bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum(), lang_info_));
if (tsv == nullptr) {
return false;
}
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ Tesseract::Tesseract()
this->params())
, BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
, BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
, BOOL_MEMBER(tsv_lang_info, false, "Include language info in the .tsv output file", this->params())
, BOOL_MEMBER(poly_allow_detailed_fx, false,
"Allow feature extractors to see the original outline", this->params())
, BOOL_INIT_MEMBER(tessedit_init_config_only, false,
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_flip_0O);
double_VAR_H(tessedit_lower_flip_hyphen);
double_VAR_H(tessedit_upper_flip_hyphen);
BOOL_VAR_H(tsv_lang_info);
BOOL_VAR_H(rej_trust_doc_dawg);
BOOL_VAR_H(rej_1Il_use_dict_word);
BOOL_VAR_H(rej_1Il_trust_permuter_type);
Expand Down
6 changes: 3 additions & 3 deletions src/tesseract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,9 +533,9 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,

api.GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
api.GetBoolVariable("hocr_font_info", &font_info);
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info);
bool lang_info;
api.GetBoolVariable("tsv_lang_info", &lang_info);
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, lang_info);
if (renderer->happy()) {
renderers.push_back(std::move(renderer));
} else {
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/tsv
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
tessedit_create_tsv 1
tsv_lang_info 0