summary refs log tree commit diff
path: root/src/plugins/plain/DynamicTokenizer.hpp
diff options
context:
space:
mode:
author Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2015-02-08 19:49:17 +0100
committer Benjamin Paassen <bpaassen@techfak.uni-bielefeld.de> 2015-02-08 19:49:17 +0100
commit 9ef316ed4ea8542973d272fa9c7b4c6804b28144 (patch)
tree 1e884c61b5915f913c8db404cc9137bbe8eae01c /src/plugins/plain/DynamicTokenizer.hpp
parent 05e5a4ab340d0f9f3490e7db9c8e42f70cc471da (diff)
parent f6e7859a835375c25226719a46df99ec11037599 (diff)
Merge branch 'master' of somweyr.de:ousia
Diffstat (limited to 'src/plugins/plain/DynamicTokenizer.hpp')
-rw-r--r-- src/plugins/plain/DynamicTokenizer.hpp 160
1 files changed, 107 insertions, 53 deletions
diff --git a/src/plugins/plain/DynamicTokenizer.hpp b/src/plugins/plain/DynamicTokenizer.hpp
index f7fef13..0b4dd39 100644
--- a/src/plugins/plain/DynamicTokenizer.hpp
+++ b/src/plugins/plain/DynamicTokenizer.hpp
@@ -28,34 +28,63 @@
#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_
#define _OUSIA_DYNAMIC_TOKENIZER_HPP_
+#include <set>
+#include <string>
+#include <vector>
+
#include <core/common/Location.hpp>
+#include "TokenTrie.hpp"
+
namespace ousia {
// Forward declarations
class CharReader;
-class TokenDescriptor;
/**
* The DynamicToken structure describes a token discovered by the Tokenizer.
*/
struct DynamicToken {
/**
- * Pointer pointing at the TokenDescriptor instance this token corresponds
- * to. May be one of the special TokenDescriptors defined as static members
- * of the DynamicTokenizer class.
+ * Id of the type of this token.
*/
- TokenDescriptor const *descriptor;
+ TokenTypeId type;
/**
* String that was matched.
*/
- std::string str;
+ std::string content;
/**
* Location from which the string was extracted.
*/
SourceLocation location;
+
+ /**
+ * Default constructor.
+ */
+ DynamicToken() : type(EmptyToken) {}
+
+ /**
+ * Constructor of the DynamicToken struct.
+ *
+ * @param type represents the token type.
+ * @param content is the string content that has been extracted.
+ * @param location is the location of the extracted string content in the
+ * source file.
+ */
+ DynamicToken(TokenTypeId type, const std::string &content,
+ SourceLocation location)
+ : type(type), content(content), location(location)
+ {
+ }
+
+ /**
+ * Constructor of the DynamicToken struct, only initializes the token type
+ *
+ * @param type is the id corresponding to the type of the token.
+ */
+ DynamicToken(TokenTypeId type) : type(type) {}
};
/**
@@ -64,33 +93,35 @@ struct DynamicToken {
*/
enum class WhitespaceMode {
/**
- * Preserves all whitespaces as they are found in the source file.
- */
+ * Preserves all whitespaces as they are found in the source file.
+ */
PRESERVE,
/**
- * Trims whitespace at the beginning and the end of the found text.
- */
+ * Trims whitespace at the beginning and the end of the found text.
+ */
TRIM,
/**
- * Whitespaces are trimmed and collapsed, multiple whitespace characters
- * are replaced by a single space character.
- */
+ * Whitespaces are trimmed and collapsed, multiple whitespace characters
+ * are replaced by a single space character.
+ */
COLLAPSE
};
/**
* The DynamicTokenizer is used to extract tokens and chunks of text from a
* CharReader. It allows to register and unregister tokens while parsing and
- * to modify the handling of whitespace characters.
+ * to modify the handling of whitespace characters. Note that the
+ * DynamicTokenizer always tries to extract the longest possible token from the
+ * CharReader.
*/
class DynamicTokenizer {
private:
/**
- * Reference at the char reader.
+ * Internally used token trie. This object holds all registered tokens.
*/
- CharReader &reader;
+ TokenTrie trie;
/**
* Flag defining whether whitespaces should be preserved or not.
@@ -98,53 +129,73 @@ private:
WhitespaceMode whitespaceMode;
/**
- * Vector containing all registered token descriptors.
+ * Vector containing all registered token types.
*/
- std::vector<std::unique_ptr<TokenDescriptor>> descriptors;
+ std::vector<std::string> tokens;
-public:
/**
- * Constructor of the DynamicTokenizer class.
+ * Next index in the tokens list where to search for a new token id.
+ */
+ size_t nextTokenTypeId;
+
+ /**
+ * Templated function used internally to read the current token. The
+ * function is templated in order to force code generation for all six
+ * combinations of whitespace modes and reading/peeking.
*
- * @param reader is the CharReader that should be used for reading the
- * tokens.
- * @param preserveWhitespaces should be set to true if all whitespaces
- * should be preserved (for preformated environments).
- */
- DynamicTokenizer(CharReader &reader)
- : reader(reader),
- preserveWhitespaces(preserveWhitespaces),
- location(reader.getSourceId()),
- empty(true),
- hasWhitespace(false)
- {
- }
+ * @tparam TextHandler is the type to be used for the textHandler instance.
+ * @tparam read specifies whether the function should start from and advance
+ * the read pointer of the char reader.
+ * @param reader is the CharReader instance from which the data should be
+ * read.
+ * @param token is the token structure into which the token information
+ * should be written.
+ * @return false if the end of the stream has been reached, true otherwise.
+ */
+ template <typename TextHandler, bool read>
+ bool next(CharReader &reader, DynamicToken &token);
+public:
/**
- * Destructor of the DynamicTokenizer class.
+ * Constructor of the DynamicTokenizer class.
+ *
+ * @param whitespaceMode specifies how whitespace should be handled.
*/
- ~DynamicTokenizer();
+ DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE);
/**
* Registers the given string as a token. Returns a const pointer at a
* TokenDescriptor that will be used to reference the newly created token.
*
* @param token is the token string that should be registered.
- * @return a pointer at a TokenDescriptor which is representative for the
- * newly registered token. Returns nullptr if a token with this string
- * was already registered.
+ * @return a unique identifier for the registered token or EmptyToken if
+ * an error occurred.
*/
- const TokenDescriptor* registerToken(const std::string &token);
+ TokenTypeId registerToken(const std::string &token);
/**
- * Unregisters the token belonging to the given TokenDescriptor.
+ * Unregisters the token belonging to the given TokenTypeId.
*
- * @param descr is a TokenDescriptor that was previously returned by
- * registerToken.
+ * @param type is the token type that should be unregistered. The
+ * TokenTypeId must have been returned by a previous call to
+ * registerToken.
* @return true if the operation was successful, false otherwise (e.g.
* because the given TokenDescriptor was already unregistered).
*/
- bool unregisterToken(const TokenDescriptor *descr);
+ bool unregisterToken(TokenTypeId type);
+
+ /**
+ * Returns the token string that was registered under the given
+ * TokenTypeId, or an empty string if the given TokenTypeId is
+ * invalid.
+ *
+ * @param type is the TokenTypeId for which the
+ * corresponding registered token string should
+ * be returned.
+ * @return the registered token string or an empty string if the given type
+ * was invalid.
+ */
+ std::string getTokenString(TokenTypeId type);
/**
* Sets the whitespace mode.
@@ -165,25 +216,28 @@ public:
* Reads a new token from the CharReader and stores it in the given
* DynamicToken instance.
*
+ * @param reader is the CharReader instance from which the data should be
+ * read.
* @param token is a reference at the token instance into which the Token
* information should be written.
* @return true if a token could be read, false if the end of the stream
* has been reached.
*/
- bool read(DynamicToken &token);
+ bool read(CharReader &reader, DynamicToken &token);
/**
- * TokenDescriptor representing an empty token.
- */
- static const *TokenDescriptor Empty;
-
- /**
- * TokenDescriptor representing generic text.
+ * The peek method does not advance the read position of the char reader,
+ * but reads the next token from the current char reader peek position.
+ *
+ * @param reader is the CharReader instance from which the data should be
+ * read.
+ * @param token is a reference at the token instance into which the Token
+ * information should be written.
+ * @return true if a token could be read, false if the end of the stream
+ * has been reached.
*/
- static const *TokenDescriptor Text;
-
+ bool peek(CharReader &reader, DynamicToken &token);
};
-
}
#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */