From 886aba423f3659ef591903f1f3dea87f8b4c6016 Mon Sep 17 00:00:00 2001 From: Igor Poboiko Date: Mon, 20 Mar 2023 13:20:33 +0000 Subject: [PATCH] [TermGenerator] Skip all unprintable characters Some extractors can produce text which includes special unicode control characters (e.g. Poppler can give us 0x0001 from some PDFs). TermGenerator then generates proper (yet meaningless) terms out of those characters, and they end up in database. It should be safe to skip all unprintable characters to avoid that (although surrogates are fine, they are dealt with later via QString::normalize call). Character 0x0001 is the worst, as it is used internally in DocTermsCodec for compactification. Such collision then leads to the corrupted database (some terms from DocTermsDB are not present in PostingDB). The corruption is not hypothetical (although not critical), I've encountered bunch of broken DB entries for some PDF files on my machine. (cherry picked from commit 492321e53a41762555ba6528e15cd0d0188ed153) --- autotests/unit/engine/termgeneratortest.cpp | 11 +++++++++++ src/engine/termgenerator.cpp | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/autotests/unit/engine/termgeneratortest.cpp b/autotests/unit/engine/termgeneratortest.cpp index 361c4934c..69885c133 100644 --- a/autotests/unit/engine/termgeneratortest.cpp +++ b/autotests/unit/engine/termgeneratortest.cpp @@ -31,6 +31,7 @@ private Q_SLOTS: void testWordPositions(); void testWordPositionsCJK(); void testNumbers(); + void testControlCharacter(); QList allWords(const QString& str) { @@ -213,6 +214,16 @@ void TermGeneratorTest::testNumbers() QCOMPARE(words, expectedWords); } +void TermGeneratorTest::testControlCharacter() +{ + QString str = QString::fromUtf8("word1\u0001word2"); + + QList words = allWords(str); + QList expectedWords = { "word1", "word2" }; + + QCOMPARE(words, expectedWords); +} + QTEST_MAIN(TermGeneratorTest) #include "termgeneratortest.moc" diff --git a/src/engine/termgenerator.cpp b/src/engine/termgenerator.cpp index d98b28416..832962da1 100644 --- a/src/engine/termgenerator.cpp +++ b/src/engine/termgenerator.cpp @@ -59,7 +59,7 @@ QByteArrayList TermGenerator::termList(const QString& text_) int start = 0; auto isSkipChar = [] (const QChar& c) { - return c.isPunct() || c.isMark() || c.isSpace(); + return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate()); }; QByteArrayList list; -- GitLab