author    Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>    2015-02-08 19:49:17 +0100
committer Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de>    2015-02-08 19:49:17 +0100
commit    9ef316ed4ea8542973d272fa9c7b4c6804b28144 (patch)
tree      1e884c61b5915f913c8db404cc9137bbe8eae01c /src/plugins
parent    05e5a4ab340d0f9f3490e7db9c8e42f70cc471da (diff)
parent    f6e7859a835375c25226719a46df99ec11037599 (diff)
Merge branch 'master' of somweyr.de:ousia
Diffstat (limited to 'src/plugins')
-rw-r--r--  src/plugins/plain/DynamicTokenizer.cpp         513
-rw-r--r--  src/plugins/plain/DynamicTokenizer.hpp         160
-rw-r--r--  src/plugins/plain/PlainFormatStreamReader.cpp  276
-rw-r--r--  src/plugins/plain/PlainFormatStreamReader.hpp   34
-rw-r--r--  src/plugins/plain/TokenTrie.cpp (renamed from src/plugins/plain/DynamicTokenTree.cpp)  40
-rw-r--r--  src/plugins/plain/TokenTrie.hpp (renamed from src/plugins/plain/DynamicTokenTree.hpp)  84
6 files changed, 806 insertions, 301 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.cpp b/src/plugins/plain/DynamicTokenizer.cpp
index 7690395..f2cfcd1 100644
--- a/src/plugins/plain/DynamicTokenizer.cpp
+++ b/src/plugins/plain/DynamicTokenizer.cpp
@@ -17,57 +17,528 @@
*/
#include <memory>
-#include <string>
-#include <unordered_map>
+#include <vector>
#include <core/common/CharReader.hpp>
+#include <core/common/Exceptions.hpp>
+#include <core/common/Utils.hpp>
#include "DynamicTokenizer.hpp"
namespace ousia {
+namespace {
+
+/* Internal class TokenMatch */
+
+/**
+ * Contains information about a matching token.
+ */
+struct TokenMatch {
+ /**
+ * Token that was matched.
+ */
+ DynamicToken token;
+
+ /**
+ * Current length of the data within the text handler. The text buffer needs
+ * to be trimmed to this length if this token matches.
+ */
+ size_t textLength;
+
+ /**
+ * End location of the current text handler. This location needs to be used
+ * for the text token that is emitted before the actual token.
+ */
+ size_t textEnd;
+
+ /**
+ * Constructor of the TokenMatch class.
+ */
+ TokenMatch() : textLength(0), textEnd(0) {}
+
+ /**
+ * Returns true if this TokenMatch instance actually represents a match.
+ */
+ bool hasMatch() { return token.type != EmptyToken; }
+};
+
+/* Internal class TokenLookup */
+
+/**
+ * The TokenLookup class is used to represent a thread in a running token
+ * lookup.
+ */
+class TokenLookup {
+private:
+ /**
+ * Current node within the token trie.
+ */
+ TokenTrie::Node const *node;
+
+ /**
+ * Start offset within the source file.
+ */
+ size_t start;
+
+ /**
+ * Current length of the data within the text handler. The text buffer needs
+ * to be trimmed to this length if this token matches.
+ */
+ size_t textLength;
+
+ /**
+ * End location of the current text handler. This location needs to be used
+ * for the text token that is emitted before the actual token.
+ */
+ size_t textEnd;
+
+public:
+ /**
+ * Constructor of the TokenLookup class.
+ *
+ * @param node is the current node.
+ * @param start is the start position.
+ * @param textLength is the text buffer length of the previous text token.
+ * @param textEnd is the current end location of the previous text token.
+ */
+ TokenLookup(const TokenTrie::Node *node, size_t start,
+ size_t textLength, size_t textEnd)
+ : node(node), start(start), textLength(textLength), textEnd(textEnd)
+ {
+ }
+
+ /**
+ * Tries to extend the current path in the token trie with the given
+ * character. If a complete token is matched, stores this match in the
+ * tokens list (in case it is longer than any previous token).
+ *
+ * @param c is the character that should be appended to the current prefix.
+ * @param lookups is a list to which new TokenLookup instances are added --
+ * which could potentially be expanded in the next iteration.
+ * @param match is the DynamicToken instance to which the matching token
+ * should be written.
+ * @param tokens is a reference at the internal token list of the
+ * DynamicTokenizer.
+ * @param end is the end byte offset of the current character.
+ * @param sourceId is the source id of this file.
+ */
+ void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match,
+ const std::vector<std::string> &tokens, SourceOffset end,
+ SourceId sourceId)
+ {
+ // Check whether the current trie node has a child for the given
+ // character -- if not, this lookup cannot be advanced
+ auto it = node->children.find(c);
+ if (it == node->children.end()) {
+ return;
+ }
+
+ // Check whether the new node represents a complete token and whether it
+ // is longer than the current token. If yes, replace the current token.
+ node = it->second.get();
+ if (node->type != EmptyToken) {
+ const std::string &str = tokens[node->type];
+ size_t len = str.size();
+ if (len > match.token.content.size()) {
+ match.token =
+ DynamicToken{node->type, str, {sourceId, start, end}};
+ match.textLength = textLength;
+ match.textEnd = textEnd;
+ }
+ }
+
+ // If this state can possibly be advanced, store it in the states list.
+ if (!node->children.empty()) {
+ lookups.emplace_back(*this);
+ }
+ }
+};
+
+/* Internal class TextHandlerBase */
+
+/**
+ * Base class used for those classes that may be used as TextHandler in the
+ * DynamicTokenizer::next function.
+ */
+class TextHandlerBase {
+public:
+ /**
+ * Start position of the extracted text.
+ */
+ size_t textStart;
+
+ /**
+ * End position of the extracted text.
+ */
+ size_t textEnd;
+
+ /**
+ * Buffer containing the extracted text.
+ */
+ std::vector<char> textBuf;
+
+ /**
+ * Constructor of the TextHandlerBase base class. Initializes the start and
+ * end position with zeros.
+ */
+ TextHandlerBase() : textStart(0), textEnd(0) {}
+
+ /**
+ * Transforms the given token into a text token containing the extracted
+ * text.
+ *
+ * @param match is the token match to which the text should be written.
+ * @param sourceId is the source id of the underlying file.
+ */
+ void buildTextToken(TokenMatch &match, SourceId sourceId)
+ {
+ if (match.hasMatch()) {
+ match.token.content =
+ std::string{textBuf.data(), match.textLength};
+ match.token.location =
+ SourceLocation{sourceId, textStart, match.textEnd};
+ } else {
+ match.token.content = std::string{textBuf.data(), textBuf.size()};
+ match.token.location = SourceLocation{sourceId, textStart, textEnd};
+ }
+ match.token.type = TextToken;
+ }
+
+ /**
+ * Returns true if this text handler has found any text and a text
+ * token could be emitted.
+ *
+ * @return true if the internal data buffer is non-empty.
+ */
+ bool hasText() { return !textBuf.empty(); }
+};
+
+/* Internal class PreservingTextHandler */
+
+/**
+ * The PreservingTextHandler class preserves all characters unmodified,
+ * including whitespace characters.
+ */
+class PreservingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
+ /**
+ * Appends the given character to the internal text buffer, does not
+ * eliminate whitespace.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+ textBuf.push_back(c);
+ }
+};
+
+/* Internal class TrimmingTextHandler */
+
/**
- * The TokenDescriptor class is a simple wrapper around a standard string
- * containing the character sequence of the token.
+ * The TrimmingTextHandler class trims all whitespace characters at the
+ * beginning and the end of a text section but leaves all other characters,
+ * including internal whitespace, unmodified.
*/
-class TokenDescriptor {
+class TrimmingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
+
/**
- * The character sequence of the token.
+ * Buffer used internally to temporarily store all whitespace characters.
+ * They are only added to the output buffer if another non-whitespace
+ * character is reached.
*/
- std::string str;
+ std::vector<char> whitespaceBuf;
/**
- * Default constructor of the TokenDescriptor class. Used to describe
- * special tokens.
+ * Appends the given character to the internal text buffer, eliminating
+ * whitespace characters at the beginning and end of the text.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
*/
- TokenDescriptor();
+ void append(char c, size_t start, size_t end)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ whitespaceBuf.push_back(c);
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (!whitespaceBuf.empty()) {
+ textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
+ whitespaceBuf.end());
+ whitespaceBuf.clear();
+ }
+ textBuf.push_back(c);
+ }
+};
+
+/* Internal class CollapsingTextHandler */
+
+/**
+ * The CollapsingTextHandler trims whitespace at the beginning and end of the
+ * text and reduces multiple whitespace characters to a single blank.
+ */
+class CollapsingTextHandler : public TextHandlerBase {
+public:
+ using TextHandlerBase::TextHandlerBase;
/**
- * Constructor initializing the character sequence of the token.
+ * Flag set to true if a whitespace character was reached.
*/
- TokenDescriptor(const std::string &str) : str(str) {}
+ bool hasWhitespace = false;
+
+ /**
+ * Appends the given character to the internal text buffer, eliminates
+ * redundant whitespace characters.
+ *
+ * @param c is the character that should be appended to the internal buffer.
+ * @param start is the start byte offset of the given character.
+ * @param end is the end byte offset of the given character.
+ */
+ void append(char c, size_t start, size_t end)
+ {
+ // Handle whitespace characters
+ if (Utils::isWhitespace(c)) {
+ if (!textBuf.empty()) {
+ hasWhitespace = true;
+ }
+ return;
+ }
+
+ // Set the start and end offset correctly
+ if (textBuf.empty()) {
+ textStart = start;
+ }
+ textEnd = end;
+
+ // Store the character
+ if (hasWhitespace) {
+ textBuf.push_back(' ');
+ hasWhitespace = false;
+ }
+ textBuf.push_back(c);
+ }
};
+}
/* Class DynamicTokenizer */
-void DynamicTokenizer:setWhitespaceMode(WhitespaceMode mode)
+DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode)
+ : whitespaceMode(whitespaceMode), nextTokenTypeId(0)
{
- whitespaceMode = mode;
}
-WhitespaceMode DynamicTokenizer::getWhitespaceMode()
+template <typename TextHandler, bool read>
+bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token)
{
- return whitespaceMode;
+ // If we're in read mode, reset the char reader peek position to the
+ // current read position
+ if (read) {
+ reader.resetPeek();
+ }
+
+ // Prepare the lookups in the token trie
+ const TokenTrie::Node *root = trie.getRoot();
+ TokenMatch match;
+ std::vector<TokenLookup> lookups;
+ std::vector<TokenLookup> nextLookups;
+
+ // Instantiate the text handler
+ TextHandler textHandler;
+
+ // Peek characters from the reader and try to advance the current token
+ // trie cursor
+ char c;
+ size_t charStart = reader.getPeekOffset();
+ const SourceId sourceId = reader.getSourceId();
+ while (reader.peek(c)) {
+ const size_t charEnd = reader.getPeekOffset();
+ const size_t textLength = textHandler.textBuf.size();
+ const size_t textEnd = textHandler.textEnd;
+
+ // If we do not have a match yet, start a new lookup from the root
+ if (!match.hasMatch()) {
+ TokenLookup{root, charStart, textLength, textEnd}.advance(
+ c, nextLookups, match, tokens, charEnd, sourceId);
+ }
+
+ // Try to advance all other lookups with the new character
+ for (TokenLookup &lookup : lookups) {
+ lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId);
+ }
+
+ // We have found a token and there are no more states to advance or the
+ // text handler has found something -- abort to return the new token
+ if (match.hasMatch()) {
+ if ((nextLookups.empty() || textHandler.hasText())) {
+ break;
+ }
+ } else {
+ // Record all incoming characters
+ textHandler.append(c, charStart, charEnd);
+ }
+
+ // Swap the lookups and the nextLookups list
+ lookups = std::move(nextLookups);
+ nextLookups.clear();
+
+ // Advance the offset
+ charStart = charEnd;
+ }
+
+ // If we found text, emit that text
+ if (textHandler.hasText() &&
+ (!match.hasMatch() || match.textLength > 0)) {
+ textHandler.buildTextToken(match, sourceId);
+ }
+
+ // Move the read/peek cursor to the end of the token, abort if an error
+ // happens while doing so
+ if (match.hasMatch()) {
+ // Make sure we have a valid location
+ if (match.token.location.getEnd() == InvalidSourceOffset) {
+ throw OusiaException{"Token end position offset out of range"};
+ }
+
+ // Seek to the end of the current token
+ const size_t end = match.token.location.getEnd();
+ if (read) {
+ reader.seek(end);
+ } else {
+ reader.seekPeekCursor(end);
+ }
+ token = match.token;
+ } else {
+ token = DynamicToken{};
+ }
+ return match.hasMatch();
+}
+
+bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token)
+{
+ switch (whitespaceMode) {
+ case WhitespaceMode::PRESERVE:
+ return next<PreservingTextHandler, true>(reader, token);
+ case WhitespaceMode::TRIM:
+ return next<TrimmingTextHandler, true>(reader, token);
+ case WhitespaceMode::COLLAPSE:
+ return next<CollapsingTextHandler, true>(reader, token);
+ }
+ return false;
+}
+
+bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token)
+{
+ switch (whitespaceMode) {
+ case WhitespaceMode::PRESERVE:
+ return next<PreservingTextHandler, false>(reader, token);
+ case WhitespaceMode::TRIM:
+ return next<TrimmingTextHandler, false>(reader, token);
+ case WhitespaceMode::COLLAPSE:
+ return next<CollapsingTextHandler, false>(reader, token);
+ }
+ return false;
}
+TokenTypeId DynamicTokenizer::registerToken(const std::string &token)
+{
+ // Abort if an empty token should be registered
+ if (token.empty()) {
+ return EmptyToken;
+ }
+
+ // Search for a new slot in the tokens list
+ TokenTypeId type = EmptyToken;
+ for (size_t i = nextTokenTypeId; i < tokens.size(); i++) {
+ if (tokens[i].empty()) {
+ tokens[i] = token;
+ type = i;
+ break;
+ }
+ }
-/* Constant initializations */
+ // No existing slot was found, add a new one -- make sure we do not
+ // override the special token type handles
+ if (type == EmptyToken) {
+ type = tokens.size();
+ if (type == TextToken || type == EmptyToken) {
+ throw OusiaException{"Token type ids depleted!"};
+ }
+ tokens.emplace_back(token);
+ }
+ nextTokenTypeId = type + 1;
-static const TokenDescriptor Empty;
-static const TokenDescriptor Text;
-static const TokenDescriptor* DynamicTokenizer::Empty = &Empty;
-static const TokenDescriptor* DynamicTokenizer::Token = &Text;
+ // Try to register the token in the trie -- if this fails, remove it
+ // from the tokens list
+ if (!trie.registerToken(token, type)) {
+ tokens[type] = std::string();
+ nextTokenTypeId = type;
+ return EmptyToken;
+ }
+ return type;
+}
+
+bool DynamicTokenizer::unregisterToken(TokenTypeId type)
+{
+ // Unregister the token from the trie, abort if an invalid type is given
+ if (type < tokens.size() && trie.unregisterToken(tokens[type])) {
+ tokens[type] = std::string{};
+ nextTokenTypeId = type;
+ return true;
+ }
+ return false;
+}
+
+std::string DynamicTokenizer::getTokenString(TokenTypeId type)
+{
+ if (type < tokens.size()) {
+ return tokens[type];
+ }
+ return std::string{};
+}
+
+void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode)
+{
+ whitespaceMode = mode;
+}
+WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; }
+/* Explicitly instantiate all possible instantiations of the "next" member
+ function */
+template bool DynamicTokenizer::next<PreservingTextHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, false>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<PreservingTextHandler, true>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<TrimmingTextHandler, true>(
+ CharReader &reader, DynamicToken &token);
+template bool DynamicTokenizer::next<CollapsingTextHandler, true>(
+ CharReader &reader, DynamicToken &token);
}
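
For orientation, here is a minimal usage sketch of the rewritten tokenizer API. It is not part of this commit, and the construction of the CharReader is assumed, since its constructor does not appear in this diff.

#include <core/common/CharReader.hpp>
#include "DynamicTokenizer.hpp"

// Illustrative driver: reads all tokens from an already constructed reader.
void dumpTokens(ousia::CharReader &reader)
{
	ousia::DynamicTokenizer tokenizer{ousia::WhitespaceMode::COLLAPSE};
	const ousia::TokenTypeId percent = tokenizer.registerToken("%");

	ousia::DynamicToken token;
	while (tokenizer.read(reader, token)) {
		if (token.type == percent) {
			// A registered token was matched; token.location points at
			// its position in the source file.
		} else if (token.type == ousia::TextToken) {
			// token.content holds the (collapsed) text between tokens.
		}
	}
}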
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
index f7fef13..0b4dd39 100644
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -28,34 +28,63 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#include <set>
+#include <string>
+#include <vector>
+
#include <core/common/Location.hpp>
+#include "TokenTrie.hpp"
+
namespace ousia {
// Forward declarations
class CharReader;
-class TokenDescriptor;
/**
* The DynamicToken structure describes a token discovered by the Tokenizer.
*/
struct DynamicToken {
/**
- * Pointer pointing at the TokenDescriptor instance this token corresponds
- * to. May be one of the special TokenDescriptors defined as static members
- * of the DynamicTokenizer class.
+ * Id of the type of this token.
*/
- TokenDescriptor const *descriptor;
+ TokenTypeId type;
/**
* String that was matched.
*/
- std::string str;
+ std::string content;
/**
* Location from which the string was extracted.
*/
SourceLocation location;
+
+ /**
+ * Default constructor.
+ */
+ DynamicToken() : type(EmptyToken) {}
+
+ /**
+ * Constructor of the DynamicToken struct.
+ *
+ * @param type represents the token type.
+ * @param content is the string content that has been extracted.
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ DynamicToken(TokenTypeId type, const std::string &content,
+ SourceLocation location)
+ : type(type), content(content), location(location)
+ {
+ }
+
+ /**
+ * Constructor of the DynamicToken struct; only initializes the token type.
+ *
+ * @param type is the id corresponding to the type of the token.
+ */
+ DynamicToken(TokenTypeId type) : type(type) {}
};
/**
@@ -64,33 +93,35 @@ struct DynamicToken {
*/
enum class WhitespaceMode {
/**
- * Preserves all whitespaces as they are found in the source file.
- */
+ * Preserves all whitespaces as they are found in the source file.
+ */
PRESERVE,
/**
- * Trims whitespace at the beginning and the end of the found text.
- */
+ * Trims whitespace at the beginning and the end of the found text.
+ */
TRIM,
/**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
+ * Whitespaces are trimmed and collapsed, multiple whitespace characters
+ * are replaced by a single space character.
+ */
COLLAPSE
};
/**
* The DynamicTokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters.
+ * to modify the handling of whitespace characters. Note that the
+ * DynamicTokenizer always tries to extract the longest possible token from
+ * the input.
*/
class DynamicTokenizer {
private:
/**
- * Reference at the char reader.
+ * Internally used token trie. This object holds all registered tokens.
*/
- CharReader &reader;
+ TokenTrie trie;
/**
* Flag defining whether whitespaces should be preserved or not.
@@ -98,53 +129,73 @@ private:
WhitespaceMode whitespaceMode;
/**
- * Vector containing all registered token descriptors.
+ * Vector containing all registered token types.
*/
- std::vector<std::unique_ptr<TokenDescriptor>> descriptors;
+ std::vector<std::string> tokens;
-public:
/**
- * Constructor of the DynamicTokenizer class.
+ * Next index in the tokens list where to search for a new token id.
+ */
+ size_t nextTokenTypeId;
+
+ /**
+ * Templated function used internally to read the current token. The
+ * function is templated in order to force code generation for all six
+ * combinations of whitespace modes and reading/peeking.
*
- * @param reader is the CharReader that should be used for reading the
- * tokens.
- * @param preserveWhitespaces should be set to true if all whitespaces
- * should be preserved (for preformated environments).
- */
- DynamicTokenizer(CharReader &reader)
- : reader(reader),
- preserveWhitespaces(preserveWhitespaces),
- location(reader.getSourceId()),
- empty(true),
- hasWhitespace(false)
- {
- }
+ * @tparam TextHandler is the type to be used for the textHandler instance.
+ * @tparam read specifies whether the function should start from and advance
+ * the read pointer of the char reader.
+ * @param reader is the CharReader instance from which the data should be
+ * read.
+ * @param token is the token structure into which the token information
+ * should be written.
+ * @return false if the end of the stream has been reached, true otherwise.
+ */
+ template <typename TextHandler, bool read>
+ bool next(CharReader &reader, DynamicToken &token);
+public:
/**
- * Destructor of the DynamicTokenizer class.
+ * Constructor of the DynamicTokenizer class.
+ *
+ * @param whitespaceMode specifies how whitespace should be handled.
*/
- ~DynamicTokenizer();
+ DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
/**
* Registers the given string as a token. Returns a const pointer at a
* TokenDescriptor that will be used to reference the newly created token.
*
* @param token is the token string that should be registered.
- * @return a pointer at a TokenDescriptor which is representative for the
- * newly registered token. Returns nullptr if a token with this string
- * was already registered.
+ * @return a unique identifier for the registered token or EmptyToken if
+ * an error occured.
*/
- const TokenDescriptor* registerToken(const std::string &token);
+ TokenTypeId registerToken(const std::string &token);
/**
- * Unregisters the token belonging to the given TokenDescriptor.
+ * Unregisters the token belonging to the given TokenTypeId.
*
- * @param descr is a TokenDescriptor that was previously returned by
- * registerToken.
+ * @param type is the token type that should be unregistered. The
+ * TokenTypeId must have been returned by registerToken.
* @return true if the operation was successful, false otherwise (e.g.
* because the given TokenDescriptor was already unregistered).
*/
- bool unregisterToken(const TokenDescriptor *descr);
+ bool unregisterToken(TokenTypeId type);
+
+ /**
+ * Returns the token string that was registered under the given TokenTypeId,
+ * or an empty string if an invalid TokenTypeId is given.
+ *
+ * @param type is the TokenTypeId for which the corresponding token string
+ * should be returned.
+ * @return the registered token string or an empty string if the given type
+ * was invalid.
+ */
+ std::string getTokenString(TokenTypeId type);
/**
* Sets the whitespace mode.
@@ -165,25 +216,28 @@ public:
* Reads a new token from the CharReader and stores it in the given
* DynamicToken instance.
*
+ * @param reader is the CharReader instance from which the data should be
+ * read.
* @param token is a reference at the token instance into which the Token
* information should be written.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool read(DynamicToken &token);
+ bool read(CharReader &reader, DynamicToken &token);
/**
- * TokenDescriptor representing an empty token.
- */
- static const *TokenDescriptor Empty;
-
- /**
- * TokenDescriptor representing generic text.
+ * The peek method does not advance the read position of the char reader,
+ * but reads the next token from the current char reader peek position.
+ *
+ * @param reader is the CharReader instance from which the data should be
+ * read.
+ * @param token is a reference at the token instance into which the Token
+ * information should be written.
+ * @return true if a token could be read, false if the end of the stream
+ * has been reached.
*/
- static const *TokenDescriptor Text;
-
+ bool peek(CharReader &reader, DynamicToken &token);
};
-
}
#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */
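
To illustrate the read/peek contract (a sketch continuing the example above; assume the input "foo%bar" with "%" registered as percent):

ousia::DynamicToken t;
tokenizer.peek(reader, t);  // text token "foo"; only the peek cursor moves
tokenizer.peek(reader, t);  // the "%" token; the read cursor is unchanged
tokenizer.read(reader, t);  // resets the peek cursor and consumes "foo"
tokenizer.read(reader, t);  // consumes "%"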
diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp
index 15ca403..498cd43 100644
--- a/src/plugins/plain/PlainFormatStreamReader.cpp
+++ b/src/plugins/plain/PlainFormatStreamReader.cpp
@@ -16,9 +16,6 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include <sstream>
-#include <unordered_set>
-
#include <core/common/CharReader.hpp>
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
@@ -27,123 +24,89 @@
namespace ousia {
-/* Internally used types, protected from spilling the exports by a namespace */
-
namespace {
-/**
- * Enum used to specify the state of the parseBlockComment state machine.
- */
-enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT };
/**
- * Class taking care of recording plain text data found withing the file.
+ * Class used internally to collect data issued via "DATA" event.
*/
class DataHandler {
private:
/**
- * Const reference at the reader, used for reading the current location.
+ * Internal character buffer.
*/
- const CharReader &reader;
+ std::vector<char> buf;
/**
- * Flag defining whether whitespaces should be preserved or not.
+ * Start location of the character data.
*/
- const bool preserveWhitespaces;
+ SourceOffset start;
/**
- * Current source range of the data in the buffer.
+ * End location of the character data.
*/
- SourceLocation location;
+ SourceOffset end;
- /**
- * Current buffer containing all read characters.
- */
- std::stringstream buffer;
+public:
/**
- * Set to false, once a non-whitespace character was reached.
+ * Default constructor, initializes start and end with zeros.
*/
- bool empty;
+ DataHandler() : start(0), end(0) {}
/**
- * Set to true if a whitespace was found -- these are normalized to a single
- * space.
+ * Returns true if the internal buffer is empty.
+ *
+ * @return true if no characters were added to the internal buffer, false
+ * otherwise.
*/
- bool hasWhitespace;
+ bool isEmpty() { return buf.empty(); }
-public:
/**
- * Constructor of the DataHandler class.
+ * Appends a single character to the internal buffer.
*
- * @param reader is the CharReader that should be used for reading the data
- * location.
- * @param preserveWhitespaces should be set to true if all whitespaces
- * should be preserved (for preformated environments).
+ * @param c is the character that should be added to the internal buffer.
+ * @param charStart is the start position of the character.
+ * @param charEnd is the end position of the character.
*/
- DataHandler(const CharReader &reader, bool preserveWhitespaces = false)
- : reader(reader),
- preserveWhitespaces(preserveWhitespaces),
- location(reader.getSourceId()),
- empty(true),
- hasWhitespace(false)
+ void append(char c, SourceOffset charStart, SourceOffset charEnd)
{
+ if (isEmpty()) {
+ start = charStart;
+ }
+ buf.push_back(c);
+ end = charEnd;
}
/**
- * Appends the given character to the internal buffer.
+ * Appends a string to the internal buffer.
*
- * @param c is the character that should be appended.
- * @param wasEscaped is set to true if the character was escaped (prepended
- * with a backslash), this allows whitespace characters to be explicitly
- * included.
+ * @param s is the string that should be added to the internal buffer.
+ * @param stringStart is the start position of the string.
+ * @param stringEnd is the end position of the string.
*/
- void append(char c, bool wasEscaped = false)
+ void append(const std::string &s, SourceOffset stringStart,
+ SourceOffset stringEnd)
{
- // Check whether the character is a whitespace
- const bool isWhitespace =
- !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c);
-
- // Trim leading and trailing whitespaces
- if (isWhitespace) {
- if (!empty) {
- hasWhitespace = true;
- }
- } else {
- // Compress whitespaces to a single space
- if (hasWhitespace) {
- buffer << ' ';
- hasWhitespace = false;
- }
-
- // Append the character
- buffer << c;
-
- // Update the "empty" flag and set the start and end offset
- if (empty) {
- location.setStart(reader.getOffset());
- empty = false;
- }
- location.setEnd(reader.getPeekOffset());
+ if (isEmpty()) {
+ start = stringStart;
}
+ std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
+ end = stringEnd;
}
/**
- * Returns true if no non-whitespace character has been found until now.
- *
- * @return true if the internal buffer is still empty.
- */
- bool isEmpty() { return empty; }
-
- /**
- * Returns a variant containg the read data and its location.
+ * Converts the internal buffer to a variant with attached location
+ * information.
*
- * @return a variant with a string value containing the read data and the
- * location being set to
+ * @param sourceId is the source id which is needed for building the
+ * location information.
+ * @return a Variant with the internal buffer content as string and
+ * the correct start and end location.
*/
- Variant getData()
+ Variant toVariant(SourceId sourceId)
{
- Variant res = Variant::fromString(buffer.str());
- res.setLocation(location);
+ Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
+ res.setLocation({sourceId, start, end});
return res;
}
};
@@ -153,35 +116,26 @@ PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,
Logger &logger)
: reader(reader), logger(logger), fieldIdx(0)
{
+ tokenBackslash = tokenizer.registerToken("\\");
+ tokenLinebreak = tokenizer.registerToken("\n");
+ tokenLineComment = tokenizer.registerToken("%");
+ tokenBlockCommentStart = tokenizer.registerToken("%{");
+ tokenBlockCommentEnd = tokenizer.registerToken("}%");
}
-/* Comment handling */
-
void PlainFormatStreamReader::parseBlockComment()
{
- char c;
- BlockCommentState state = BlockCommentState::DEFAULT;
- while (reader.read(c)) {
- switch (state) {
- case BlockCommentState::DEFAULT:
- if (c == '%') {
- state = BlockCommentState::HAS_PERCENT;
- } else if (c == '}') {
- state = BlockCommentState::HAS_CURLY_CLOSE;
- }
- break;
- case BlockCommentState::HAS_PERCENT:
- if (c == '{') {
- parseBlockComment();
- }
- state = BlockCommentState::DEFAULT;
- break;
- case BlockCommentState::HAS_CURLY_CLOSE:
- if (c == '%') {
- return;
- }
- state = BlockCommentState::DEFAULT;
- break;
+ DynamicToken token;
+ size_t depth = 1;
+ while (tokenizer.read(reader, token)) {
+ if (token.type == tokenBlockCommentEnd) {
+ depth--;
+ if (depth == 0) {
+ return;
+ }
+ }
+ if (token.type == tokenBlockCommentStart) {
+ depth++;
}
}
@@ -189,102 +143,84 @@ void PlainFormatStreamReader::parseBlockComment()
logger.error("File ended while being in a block comment", reader);
}
-void PlainFormatStreamReader::parseComment()
+void PlainFormatStreamReader::parseLineComment()
{
char c;
- bool first = true;
reader.consumePeek();
while (reader.read(c)) {
- // Continue parsing a block comment if a '{' is found
- if (c == '{' && first) {
- parseBlockComment();
- return;
- }
if (c == '\n') {
return;
}
- first = false;
}
}
-/* Top level parse function */
-
-static const std::unordered_set<char> EscapeableCharacters{'\\', '<', '>',
- '{', '}', '%'};
-
PlainFormatStreamReader::State PlainFormatStreamReader::parse()
{
// Macro (sorry for that) used for checking whether there is data to issue, and
// if yes, aborting the loop, allowing for a reentry on a later parse call by
// resetting the peek cursor
-#define CHECK_ISSUE_DATA() \
- { \
- if (!dataHandler.isEmpty()) { \
- reader.resetPeek(); \
- abort = true; \
- break; \
- } \
+#define CHECK_ISSUE_DATA() \
+ { \
+ if (!dataHandler.isEmpty()) { \
+ reader.resetPeek(); \
+ abort = true; \
+ break; \
+ } \
}
- // Data handler
- DataHandler dataHandler(reader);
+ // Handler for incoming data
+ DataHandler dataHandler;
// Variable set to true if the parser loop should be left
bool abort = false;
- // Happily add characters to the dataHandler and handle escaping until a
- // special character is reached. Then go to a specialiced parsing routine
- char c;
- while (!abort && reader.peek(c)) {
- switch (c) {
- case '\\':
- reader.peek(c);
- // Check whether this backslash just escaped some special or
- // whitespace character or was the beginning of a command
- if (EscapeableCharacters.count(c) == 0 &&
- !Utils::isWhitespace(c)) {
- CHECK_ISSUE_DATA();
- // TODO: Parse command (starting from the backslash)
- return State::COMMAND;
- }
- // A character was escaped, add it to the buffer, with the
- // wasEscaped flag set to true
- dataHandler.append(c, true);
- break;
- case '<':
- // TODO: Annotations
- break;
- case '>':
- // TODO: Annotations
- break;
- case '{':
- // TODO: Issue start of field
- break;
- case '}':
- // TODO: Issue end of field
- case '%':
- CHECK_ISSUE_DATA();
- parseComment();
- break;
- case '\n':
+ // Read tokens until the outer loop should be left
+ DynamicToken token;
+ while (!abort && tokenizer.peek(reader, token)) {
+ // Check whether this backslash just escaped some special or
+ // whitespace character or was the beginning of a command
+ if (token.type == tokenBackslash) {
+ // Check whether this character could be the start of a command
+ char c;
+ reader.consumePeek();
+ reader.peek(c);
+ if (Utils::isIdentifierStart(c)) {
CHECK_ISSUE_DATA();
- reader.consumePeek();
- return State::LINEBREAK;
- default:
- dataHandler.append(c, false);
+ // TODO: Parse a command
+ return State::COMMAND;
+ }
+
+ // This was not a special character; just append the given character
+ // to the data buffer, use the escape character start as start
+ // location and the peek offset as end location
+ dataHandler.append(c, token.location.getStart(),
+ reader.getPeekOffset());
+ } else if (token.type == tokenLineComment) {
+ CHECK_ISSUE_DATA();
+ reader.consumePeek();
+ parseLineComment();
+ } else if (token.type == tokenBlockCommentStart) {
+ CHECK_ISSUE_DATA();
+ reader.consumePeek();
+ parseBlockComment();
+ } else if (token.type == tokenLinebreak) {
+ CHECK_ISSUE_DATA();
+ reader.consumePeek();
+ return State::LINEBREAK;
+ } else if (token.type == TextToken) {
+ dataHandler.append(token.content, token.location.getStart(),
+ token.location.getEnd());
}
// Consume the peeked character if we did not abort, otherwise abort
if (!abort) {
reader.consumePeek();
- } else {
- break;
}
}
// Send out pending output data, otherwise we are at the end of the stream
if (!dataHandler.isEmpty()) {
- data = dataHandler.getData();
+ data = dataHandler.toVariant(reader.getSourceId());
return State::DATA;
}
return State::END;
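
A sketch of how this event-style parse function is meant to be driven. The accessor for the pending data Variant is not part of this diff; the getData() mentioned below is a hypothetical name.

using State = ousia::PlainFormatStreamReader::State;

void drive(ousia::PlainFormatStreamReader &streamReader)
{
	for (;;) {
		switch (streamReader.parse()) {
			case State::DATA:
				// A text Variant with location information is pending;
				// reading it requires an accessor such as the assumed
				// getData().
				break;
			case State::LINEBREAK:
				// A linebreak token ("\n") was consumed.
				break;
			case State::COMMAND:
				// Command parsing is still a TODO in this commit.
				break;
			case State::END:
				return;
			default:
				// Other states (e.g. fields) are not handled here.
				break;
		}
	}
}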
diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp
index 1a136cd..b2ea378 100644
--- a/src/plugins/plain/PlainFormatStreamReader.hpp
+++ b/src/plugins/plain/PlainFormatStreamReader.hpp
@@ -31,6 +31,8 @@
#include <core/common/Variant.hpp>
+#include "DynamicTokenizer.hpp"
+
namespace ousia {
// Forward declarations
@@ -123,6 +125,11 @@ private:
Logger &logger;
/**
+ * Tokenizer instance used to read individual tokens from the text.
+ */
+ DynamicTokenizer tokenizer;
+
+ /**
* Variant containing the current command name (always is a string variant,
* but additionally contains the correct location of the name).
*/
@@ -141,6 +148,31 @@ private:
Variant data;
/**
+ * Id of the backslash token.
+ */
+ TokenTypeId tokenBackslash;
+
+ /**
+ * Id of the linebreak token.
+ */
+ TokenTypeId tokenLinebreak;
+
+ /**
+ * Id of the line comment token.
+ */
+ TokenTypeId tokenLineComment;
+
+ /**
+ * Id of the block comment start token.
+ */
+ TokenTypeId tokenBlockCommentStart;
+
+ /**
+ * Id of the block comment end token.
+ */
+ TokenTypeId tokenBlockCommentEnd;
+
+ /**
* Contains the field index of the current command.
*/
size_t fieldIdx;
@@ -153,7 +185,7 @@ private:
/**
* Function used internally to parse a generic comment.
*/
- void parseComment();
+ void parseLineComment();
public:
/**
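
Since parseBlockComment now counts matching tokenBlockCommentStart/tokenBlockCommentEnd pairs with a depth counter, block comments may nest. For example, in

	text before %{ level one %{ level two }% still level one }% text after

only "text before" and "text after" are emitted as data; an unbalanced comment is reported through the logger when the file ends.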
diff --git a/src/plugins/plain/DynamicTokenTree.cpp b/src/plugins/plain/TokenTrie.cpp
index 8b7bfc2..4a0430b 100644
--- a/src/plugins/plain/DynamicTokenTree.cpp
+++ b/src/plugins/plain/TokenTrie.cpp
@@ -16,18 +16,18 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include "DynamicTokenTree.hpp"
+#include "TokenTrie.hpp"
namespace ousia {
/* Class DynamicTokenTree::Node */
-DynamicTokenTree::Node::Node() : descriptor(nullptr) {}
+TokenTrie::Node::Node() : type(EmptyToken) {}
/* Class DynamicTokenTree */
-bool DynamicTokenTree::registerToken(const std::string &token,
- const TokenDescriptor *descriptor) noexcept
+bool TokenTrie::registerToken(const std::string &token,
+ TokenTypeId type) noexcept
{
// Abort if the token is empty -- this would taint the root node
if (token.empty()) {
@@ -42,23 +42,22 @@ bool DynamicTokenTree::registerToken(const std::string &token,
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- it = node->children.emplace(c, std::unique_ptr<Node>(new Node{}))
- .first;
+ it = node->children.emplace(c, std::make_shared<Node>()).first;
}
node = it->second.get();
}
- // If the resulting node already has a descriptor set, we're screwed.
- if (node->descriptor != nullptr) {
+ // If the resulting node already has a type set, we're screwed.
+ if (node->type != EmptyToken) {
return false;
}
- // Otherwise just set the descriptor to the given descriptor.
- node->descriptor = descriptor;
+ // Otherwise just set the type to the given type.
+ node->type = type;
return true;
}
-bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
+bool TokenTrie::unregisterToken(const std::string &token) noexcept
{
// We cannot remove empty tokens as we need to access the first character
// upfront
@@ -77,24 +76,24 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
return false;
}
- // Reset the subtree handler if this node has another descriptor
+ // Reset the subtree handler if this node has another type
node = it->second.get();
- if ((node->descriptor != nullptr || node->children.size() > 1) &&
+ if ((node->type != EmptyToken || node->children.size() > 1) &&
(i + 1 != token.size())) {
subtreeRoot = node;
subtreeKey = token[i + 1];
}
}
- // If the node descriptor is already nullptr, we cannot do anything here
- if (node->descriptor == nullptr) {
+ // If the node type is already EmptyToken, we cannot do anything here
+ if (node->type == EmptyToken) {
return false;
}
// If the target node has children, we cannot delete the subtree. Set the
- // descriptor to nullptr instead
+ // type to EmptyToken instead
if (!node->children.empty()) {
- node->descriptor = nullptr;
+ node->type = EmptyToken;
return true;
}
@@ -103,19 +102,18 @@ bool DynamicTokenTree::unregisterToken(const std::string &token) noexcept
return true;
}
-const TokenDescriptor *DynamicTokenTree::hasToken(
- const std::string &token) const noexcept
+TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept
{
Node const *node = &root;
for (size_t i = 0; i < token.size(); i++) {
const char c = token[i];
auto it = node->children.find(c);
if (it == node->children.end()) {
- return nullptr;
+ return EmptyToken;
}
node = it->second.get();
}
- return node->descriptor;
+ return node->type;
}
}
diff --git a/src/plugins/plain/DynamicTokenTree.hpp b/src/plugins/plain/TokenTrie.hpp
index c5dc4de..36c2ffa 100644
--- a/src/plugins/plain/DynamicTokenTree.hpp
+++ b/src/plugins/plain/TokenTrie.hpp
@@ -17,54 +17,61 @@
*/
/**
- * @file DynamicTokenTree.hpp
+ * @file TokenTrie.hpp
*
- * Class representing a token tree that can be updated dynamically.
+ * Class representing a token trie that can be updated dynamically.
*
* @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de)
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
-#ifndef _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
-#define _OUSIA_DYNAMIC_TOKEN_TREE_HPP_
+#ifndef _OUSIA_TOKEN_TRIE_HPP_
+#define _OUSIA_TOKEN_TRIE_HPP_
+#include <cstdint>
#include <memory>
+#include <limits>
#include <unordered_map>
namespace ousia {
-class TokenDescriptor;
+/**
+ * The TokenTypeId is used to give each token type a unique id.
+ */
+using TokenTypeId = uint32_t;
/**
- * The Tokenizer internally uses a DynamicTokenTree to be efficiently able to
- * identify the longest consecutive token in the text. This is equivalent to a
- * prefix trie.
+ * Special token type id representing the absence of a token.
+ */
+constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max();
+
+/**
+ * Special token type id representing text (the data between tokens).
+ */
+constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1;
+
+/**
+ * The Tokenizer internally uses a TokenTrie to be efficiently able to identify
+ * the longest consecutive token in the text. This is equivalent to a prefix
+ * trie.
*
- * A token tree is a construct that structures all special tokens a
- * Tokenizer recognizes. Consider the tokens "aab", "a" and "aac". Then
- * the token tree would look like this:
+ * A token trie is a construct that structures all special tokens a Tokenizer
+ * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and
+ * three. Then the token trie would look like this:
*
* \code{*.txt}
- * a
- * | \
- * a $
- * | \
- * b c
- * | |
- * $ $
+ * ~ (0)
+ * / \
+ * a (2) b (0)
+ * | |
+ * a (0) a (0)
+ * | |
+ * b (1) c (3)
* \endcode
*
- * Every node in the token tree is a valid end state that has a $ attached to
- * it. During the search algorithm the Tokenizer goes through the tree and
- * stores the last valid position. If a character follows that does not lead to
- * a new node in the TokenTree the search ends (and starts again at this
- * character). The token corresponding to the last valid position is returned.
- *
- * This allows us to uniquely identify the matching token given a certain
- * input text. Note that this is a greedy matching approach that does not
- * work if you're using truly ambiguous tokens (that have the same text).
+ * Here the number in parentheses indicates the corresponding token type id,
+ * with 0 denoting that no token ends at that node.
*/
-class DynamicTokenTree {
+class TokenTrie {
public:
/**
* Structure used to build the node tree.
@@ -73,7 +80,7 @@ public:
/**
* Type used for the child map.
*/
- using ChildMap = std::unordered_map<char, std::unique_ptr<Node>>;
+ using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>;
/**
* Map from single characters at the corresponding child nodes.
@@ -84,7 +91,7 @@ public:
* Reference at the corresponding token descriptor. Set to nullptr if
* no token is attached to this node.
*/
- TokenDescriptor const *descriptor;
+ TokenTypeId type;
/**
* Default constructor, initializes the descriptor with nullptr.
@@ -105,11 +112,10 @@ public:
*
* @param token is the character sequence that should be registered as
* token.
- * @param descriptor is the descriptor that should be set for this token.
+ * @param type is the token type id that should be set for this token.
* @return true if the operation is successful, false otherwise.
*/
- bool registerToken(const std::string &token,
- const TokenDescriptor *descriptor) noexcept;
+ bool registerToken(const std::string &token, TokenTypeId type) noexcept;
/**
* Unregisters the token from the token tree. Returns true if the token was
@@ -128,9 +134,17 @@ public:
* @return the attached token descriptor or nullptr if the given token is
* not found.
*/
- const TokenDescriptor* hasToken(const std::string &token) const noexcept;
+ TokenTypeId hasToken(const std::string &token) const noexcept;
+
+ /**
+ * Returns a pointer to the root node, to be used for traversing the token
+ * trie.
+ *
+ * @return a pointer to the root node.
+ */
+ const Node *getRoot() const noexcept { return &root; }
};
}
-#endif /* _OUSIA_DYNAMIC_TOKEN_TREE_HPP_ */
+#endif /* _OUSIA_TOKEN_TRIE_HPP_ */
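
Finally, a minimal sketch of the renamed trie API, with return values as implied by the implementation above:

#include "TokenTrie.hpp"

void trieExample()
{
	ousia::TokenTrie trie;
	trie.registerToken("aab", 1);  // true
	trie.registerToken("a", 2);    // true -- prefixes may coexist
	trie.registerToken("a", 7);    // false -- the node already has a type
	trie.hasToken("a");            // == 2
	trie.hasToken("aa");           // == ousia::EmptyToken (only a prefix)
	trie.unregisterToken("a");     // true; "aab" stays registered
	trie.hasToken("aab");          // == 1
}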