summaryrefslogtreecommitdiff
path: root/src/plugins
diff options
context:
space:
mode:
authorAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-08 18:49:02 +0100
committerAndreas Stöckel <astoecke@techfak.uni-bielefeld.de>2015-02-08 18:49:02 +0100
commit51f09f4faa7cd4b6a0576758881d322e31e896ba (patch)
tree74660d05494ed41a788fcb5d6c7efd8a5407d57c /src/plugins
parentf066b4887f6f2896fe602f14ede9c02a9f5a7e1a (diff)
Ported PlainFormatStreamReader to DynamicTokenizer
Diffstat (limited to 'src/plugins')
-rw-r--r--src/plugins/plain/PlainFormatStreamReader.cpp279
-rw-r--r--src/plugins/plain/PlainFormatStreamReader.hpp34
2 files changed, 116 insertions, 197 deletions
diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp
index 15ca403..f0721a0 100644
--- a/src/plugins/plain/PlainFormatStreamReader.cpp
+++ b/src/plugins/plain/PlainFormatStreamReader.cpp
@@ -16,9 +16,6 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include <sstream>
-#include <unordered_set>
-
#include <core/common/CharReader.hpp>
#include <core/common/Logger.hpp>
#include <core/common/Utils.hpp>
@@ -27,123 +24,40 @@
namespace ousia {
-/* Internally used types, protected from spilling the exports by a namespace */
-
namespace {
-/**
- * Enum used to specify the state of the parseBlockComment state machine.
- */
-enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT };
-
-/**
- * Class taking care of recording plain text data found withing the file.
- */
-class DataHandler {
-private:
- /**
- * Const reference at the reader, used for reading the current location.
- */
- const CharReader &reader;
-
- /**
- * Flag defining whether whitespaces should be preserved or not.
- */
- const bool preserveWhitespaces;
+struct DataHandler {
+ std::vector<char> buf;
- /**
- * Current source range of the data in the buffer.
- */
- SourceLocation location;
+ SourceOffset start;
+ SourceOffset end;
- /**
- * Current buffer containing all read characters.
- */
- std::stringstream buffer;
+ DataHandler() : start(0), end(0) {}
- /**
- * Set to false, once a non-whitespace character was reached.
- */
- bool empty;
+ bool isEmpty() { return buf.empty(); }
- /**
- * Set to true if a whitespace was found -- these are normalized to a single
- * space.
- */
- bool hasWhitespace;
-
-public:
- /**
- * Constructor of the DataHandler class.
- *
- * @param reader is the CharReader that should be used for reading the data
- * location.
- * @param preserveWhitespaces should be set to true if all whitespaces
- * should be preserved (for preformated environments).
- */
- DataHandler(const CharReader &reader, bool preserveWhitespaces = false)
- : reader(reader),
- preserveWhitespaces(preserveWhitespaces),
- location(reader.getSourceId()),
- empty(true),
- hasWhitespace(false)
+ void append(char c, SourceOffset charStart, SourceOffset charEnd)
{
+ if (isEmpty()) {
+ start = charStart;
+ }
+ buf.push_back(c);
+ end = charEnd;
}
- /**
- * Appends the given character to the internal buffer.
- *
- * @param c is the character that should be appended.
- * @param wasEscaped is set to true if the character was escaped (prepended
- * with a backslash), this allows whitespace characters to be explicitly
- * included.
- */
- void append(char c, bool wasEscaped = false)
+ void append(const std::string &s, SourceOffset stringStart,
+ SourceOffset stringEnd)
{
- // Check whether the character is a whitespace
- const bool isWhitespace =
- !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c);
-
- // Trim leading and trailing whitespaces
- if (isWhitespace) {
- if (!empty) {
- hasWhitespace = true;
- }
- } else {
- // Compress whitespaces to a single space
- if (hasWhitespace) {
- buffer << ' ';
- hasWhitespace = false;
- }
-
- // Append the character
- buffer << c;
-
- // Update the "empty" flag and set the start and end offset
- if (empty) {
- location.setStart(reader.getOffset());
- empty = false;
- }
- location.setEnd(reader.getPeekOffset());
+ if (isEmpty()) {
+ start = stringStart;
}
+ std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
+ end = stringEnd;
}
- /**
- * Returns true if no non-whitespace character has been found until now.
- *
- * @return true if the internal buffer is still empty.
- */
- bool isEmpty() { return empty; }
-
- /**
- * Returns a variant containg the read data and its location.
- *
- * @return a variant with a string value containing the read data and the
- * location being set to
- */
- Variant getData()
+ Variant toVariant(SourceId sourceId)
{
- Variant res = Variant::fromString(buffer.str());
- res.setLocation(location);
+ Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
+ res.setLocation({sourceId, start, end});
return res;
}
};
@@ -153,35 +67,26 @@ PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader,
Logger &logger)
: reader(reader), logger(logger), fieldIdx(0)
{
+ tokenBackslash = tokenizer.registerToken("\\");
+ tokenLinebreak = tokenizer.registerToken("\n");
+ tokenLineComment = tokenizer.registerToken("%");
+ tokenBlockCommentStart = tokenizer.registerToken("%{");
+ tokenBlockCommentEnd = tokenizer.registerToken("}%");
}
-/* Comment handling */
-
void PlainFormatStreamReader::parseBlockComment()
{
- char c;
- BlockCommentState state = BlockCommentState::DEFAULT;
- while (reader.read(c)) {
- switch (state) {
- case BlockCommentState::DEFAULT:
- if (c == '%') {
- state = BlockCommentState::HAS_PERCENT;
- } else if (c == '}') {
- state = BlockCommentState::HAS_CURLY_CLOSE;
- }
- break;
- case BlockCommentState::HAS_PERCENT:
- if (c == '{') {
- parseBlockComment();
- }
- state = BlockCommentState::DEFAULT;
- break;
- case BlockCommentState::HAS_CURLY_CLOSE:
- if (c == '%') {
- return;
- }
- state = BlockCommentState::DEFAULT;
- break;
+ DynamicToken token;
+ size_t depth = 1;
+ while (tokenizer.read(reader, token)) {
+ if (token.type == tokenBlockCommentEnd) {
+ depth--;
+ if (depth == 0) {
+ return;
+ }
+ }
+ if (token.type == tokenBlockCommentStart) {
+ depth++;
}
}
@@ -189,102 +94,84 @@ void PlainFormatStreamReader::parseBlockComment()
logger.error("File ended while being in a block comment", reader);
}
-void PlainFormatStreamReader::parseComment()
+void PlainFormatStreamReader::parseLineComment()
{
char c;
- bool first = true;
reader.consumePeek();
while (reader.read(c)) {
- // Continue parsing a block comment if a '{' is found
- if (c == '{' && first) {
- parseBlockComment();
- return;
- }
if (c == '\n') {
return;
}
- first = false;
}
}
-/* Top level parse function */
-
-static const std::unordered_set<char> EscapeableCharacters{'\\', '<', '>',
- '{', '}', '%'};
-
PlainFormatStreamReader::State PlainFormatStreamReader::parse()
{
// Macro (sorry for that) used for checking whether there is data to issue, and
// if yes, aborting the loop, allowing for a reentry on a later parse call by
// resetting the peek cursor
-#define CHECK_ISSUE_DATA() \
- { \
- if (!dataHandler.isEmpty()) { \
- reader.resetPeek(); \
- abort = true; \
- break; \
- } \
+#define CHECK_ISSUE_DATA() \
+ { \
+ if (!dataHandler.isEmpty()) { \
+ reader.resetPeek(); \
+ abort = true; \
+ break; \
+ } \
}
- // Data handler
- DataHandler dataHandler(reader);
+ // Handler for incomming data
+ DataHandler dataHandler;
// Variable set to true if the parser loop should be left
bool abort = false;
- // Happily add characters to the dataHandler and handle escaping until a
- // special character is reached. Then go to a specialiced parsing routine
- char c;
- while (!abort && reader.peek(c)) {
- switch (c) {
- case '\\':
- reader.peek(c);
- // Check whether this backslash just escaped some special or
- // whitespace character or was the beginning of a command
- if (EscapeableCharacters.count(c) == 0 &&
- !Utils::isWhitespace(c)) {
- CHECK_ISSUE_DATA();
- // TODO: Parse command (starting from the backslash)
- return State::COMMAND;
- }
- // A character was escaped, add it to the buffer, with the
- // wasEscaped flag set to true
- dataHandler.append(c, true);
- break;
- case '<':
- // TODO: Annotations
- break;
- case '>':
- // TODO: Annotations
- break;
- case '{':
- // TODO: Issue start of field
- break;
- case '}':
- // TODO: Issue end of field
- case '%':
- CHECK_ISSUE_DATA();
- parseComment();
- break;
- case '\n':
+ // Read tokens until the outer loop should be left
+ DynamicToken token;
+ while (!abort && tokenizer.peek(reader, token)) {
+ // Check whether this backslash just escaped some special or
+ // whitespace character or was the beginning of a command
+ if (token.type == tokenBackslash) {
+ // Check whether this character could be the start of a command
+ char c;
+ reader.consumePeek();
+ reader.peek(c);
+ if (Utils::isIdentifierStart(c)) {
CHECK_ISSUE_DATA();
- reader.consumePeek();
- return State::LINEBREAK;
- default:
- dataHandler.append(c, false);
+ // TODO: Parse a command
+ return State::COMMAND;
+ }
+
+ // This was not a special character, just append the given character
+ // to the data buffer, use the escape character start as start
+ // location and the peek offset as end location
+ dataHandler.append(c, token.location.getStart(),
+ reader.getPeekOffset());
+ } else if (token.type == tokenLineComment) {
+ CHECK_ISSUE_DATA();
+ reader.consumePeek();
+ parseLineComment();
+ } else if (token.type == tokenBlockCommentStart) {
+ CHECK_ISSUE_DATA();
+ reader.consumePeek();
+ parseBlockComment();
+ } else if (token.type == tokenLinebreak) {
+ CHECK_ISSUE_DATA();
+ reader.consumePeek();
+ return State::LINEBREAK;
+ } else if (token.type == TextToken) {
+ dataHandler.append(token.content, token.location.getStart(),
+ token.location.getEnd());
}
// Consume the peeked character if we did not abort, otherwise abort
if (!abort) {
reader.consumePeek();
- } else {
- break;
}
}
// Send out pending output data, otherwise we are at the end of the stream
if (!dataHandler.isEmpty()) {
- data = dataHandler.getData();
+ data = dataHandler.toVariant(reader.getSourceId());
return State::DATA;
}
return State::END;
diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp
index 1a136cd..b2ea378 100644
--- a/src/plugins/plain/PlainFormatStreamReader.hpp
+++ b/src/plugins/plain/PlainFormatStreamReader.hpp
@@ -31,6 +31,8 @@
#include <core/common/Variant.hpp>
+#include "DynamicTokenizer.hpp"
+
namespace ousia {
// Forward declarations
@@ -123,6 +125,11 @@ private:
Logger &logger;
/**
+ * Tokenizer instance used to read individual tokens from the text.
+ */
+ DynamicTokenizer tokenizer;
+
+ /**
* Variant containing the current command name (always is a string variant,
* but additionally contains the correct locatino of the name).
*/
@@ -141,6 +148,31 @@ private:
Variant data;
/**
+ * Id of the backslash token.
+ */
+ TokenTypeId tokenBackslash;
+
+ /**
+ * Id of the linebreak token.
+ */
+ TokenTypeId tokenLinebreak;
+
+ /**
+ * Id of the line comment token.
+ */
+ TokenTypeId tokenLineComment;
+
+ /**
+ * Id of the block comment start token.
+ */
+ TokenTypeId tokenBlockCommentStart;
+
+ /**
+ * If of the block comment end token.
+ */
+ TokenTypeId tokenBlockCommentEnd;
+
+ /**
* Contains the field index of the current command.
*/
size_t fieldIdx;
@@ -153,7 +185,7 @@ private:
/**
* Function used internally to parse a generic comment.
*/
- void parseComment();
+ void parseLineComment();
public:
/**