diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-03-01 13:50:15 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-03-01 13:50:15 +0100 |
commit | 31c83c05d257c9a7a336f12342c401f97d380674 (patch) | |
tree | 80beddd6a2fac6990dd6e153dd4d3375c7a280bd /src/core/parser/utils | |
parent | cb6cacdc7eade9d4290767bafb7ccf4e935d0fbf (diff) |
Prefer longer non-primary tokens
Diffstat (limited to 'src/core/parser/utils')
-rw-r--r-- | src/core/parser/utils/Tokenizer.cpp | 45 |
1 files changed, 22 insertions, 23 deletions
```diff
diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp
index 94d9cb0..8d540a6 100644
--- a/src/core/parser/utils/Tokenizer.cpp
+++ b/src/core/parser/utils/Tokenizer.cpp
@@ -188,7 +188,7 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 		const size_t dataStartOffset = data.size();
 
 		// If we do not have a match yet, start a new lookup from the root
-		if (!bestMatch.hasMatch()) {
+		if (!bestMatch.hasMatch() || !bestMatch.primary) {
 			lookups.emplace_back(root, charStart, dataStartOffset);
 		}
 
@@ -201,36 +201,35 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 				continue;
 			}
 
-			// If the matched token is primary, check whether it is better than
-			// the current best match, if yes, replace the best match. In any
-			// case just continue
-			if (match.primary) {
-				if (match.size() > bestMatch.size()) {
-					bestMatch = match;
-				}
-				continue;
+			// Replace the best match with longest token
+			if (match.size() > bestMatch.size()) {
+				bestMatch = match;
 			}
 
-			// Otherwise -- if the matched token is a non-primary token (and no
-			// primary token has been found until now) -- mark the match in the
-			// TokenizedData
-			if (!bestMatch.hasMatch()) {
+			// If the matched token is a non-primary token -- mark the match in
+			// the TokenizedData list
+			if (!match.primary) {
 				data.mark(match.token.id, data.size() - match.size() + 1,
 				          match.size());
 			}
 		}
 
-		// We have found a token and there are no more states to advance or the
-		// text handler has found something -- abort to return the new token
-		if (bestMatch.hasMatch()) {
-			if ((nextLookups.empty() || data.size() > initialDataSize)) {
+
+		// If a token has been found and the token is a primary token, check
+		// whether we have to abort, otherwise if we have a non-primary match,
+		// reset it once it can no longer be advanced
+		if (bestMatch.hasMatch() && nextLookups.empty()) {
+			if (bestMatch.primary) {
 				break;
+			} else {
+				bestMatch = TokenMatch{};
 			}
-		} else {
-			// Record all incomming characters
-			data.append(c, charStart, charEnd);
 		}
 
+		// Record all incomming characters
+		data.append(c, charStart, charEnd);
+
+
 		// Swap the lookups and the nextLookups list
 		lookups = std::move(nextLookups);
 		nextLookups.clear();
@@ -241,17 +240,17 @@ bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data)
 
 	// If we found data, emit a corresponding data token
 	if (data.size() > initialDataSize &&
-	    (!bestMatch.hasMatch() ||
+	    (!bestMatch.hasMatch() || !bestMatch.primary ||
 	     bestMatch.dataStartOffset > initialDataSize)) {
 		// If we have a "bestMatch" wich starts after text data has started,
 		// trim the TokenizedData to this offset
-		if (bestMatch.dataStartOffset > initialDataSize) {
+		if (bestMatch.dataStartOffset > initialDataSize && bestMatch.primary) {
 			data.trim(bestMatch.dataStartOffset);
 		}
 
 		// Create a token containing the data location
 		bestMatch.token = Token{data.getLocation()};
-	} else if (bestMatch.hasMatch() &&
+	} else if (bestMatch.hasMatch() && bestMatch.primary &&
 	           bestMatch.dataStartOffset == initialDataSize) {
 		data.trim(initialDataSize);
 	}
```