diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-05 02:50:55 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-05 02:50:55 +0100 |
commit | 62d336437d1e92ff8356c39efe22dd4a59de71ee (patch) | |
tree | e4a0c533e3711faa05ab8eb3c8247301c43ac385 | |
parent | 68ae3c4fd9db8baef4fea99d91766af5bc210506 (diff) |
Added initial skeleton of the PlainFormatStreamReader class, providing the most basic functionality needed to read data from an Ousia plain text file.
-rw-r--r-- | CMakeLists.txt | 20 | ||||
-rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.cpp | 294 | ||||
-rw-r--r-- | src/plugins/plain/PlainFormatStreamReader.hpp | 187 | ||||
-rw-r--r-- | test/plugins/plain/PlainFormatStreamReaderTest.cpp | 239 |
4 files changed, 740 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 94c89dc..97d2e78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,15 @@ TARGET_LINK_LIBRARIES(ousia_xml ${EXPAT_LIBRARIES} ) +ADD_LIBRARY(ousia_plain + src/plugins/plain/PlainFormatStreamReader +) + +TARGET_LINK_LIBRARIES(ousia_plain + ousia_core +) + + #ADD_LIBRARY(ousia_mozjs # src/plugins/mozjs/MozJsScriptEngine #) @@ -313,6 +322,16 @@ IF(TEST) ousia_filesystem ) + ADD_EXECUTABLE(ousia_test_plain + test/plugins/plain/PlainFormatStreamReaderTest + ) + + TARGET_LINK_LIBRARIES(ousia_test_plain + ${GTEST_LIBRARIES} + ousia_core + ousia_plain + ) + # ADD_EXECUTABLE(ousia_test_mozjs # test/plugins/mozjs/MozJsScriptEngineTest # ) @@ -329,6 +348,7 @@ IF(TEST) ADD_TEST(ousia_test_css ousia_test_css) ADD_TEST(ousia_test_html ousia_test_html) ADD_TEST(ousia_test_xml ousia_test_xml) + ADD_TEST(ousia_test_plain ousia_test_plain) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) ENDIF() diff --git a/src/plugins/plain/PlainFormatStreamReader.cpp b/src/plugins/plain/PlainFormatStreamReader.cpp new file mode 100644 index 0000000..15ca403 --- /dev/null +++ b/src/plugins/plain/PlainFormatStreamReader.cpp @@ -0,0 +1,294 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include <sstream> +#include <unordered_set> + +#include <core/common/CharReader.hpp> +#include <core/common/Logger.hpp> +#include <core/common/Utils.hpp> + +#include "PlainFormatStreamReader.hpp" + +namespace ousia { + +/* Internally used types, protected from spilling the exports by a namespace */ + +namespace { +/** + * Enum used to specify the state of the parseBlockComment state machine. + */ +enum class BlockCommentState { DEFAULT, HAS_CURLY_CLOSE, HAS_PERCENT }; + +/** + * Class taking care of recording plain text data found withing the file. + */ +class DataHandler { +private: + /** + * Const reference at the reader, used for reading the current location. + */ + const CharReader &reader; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + const bool preserveWhitespaces; + + /** + * Current source range of the data in the buffer. + */ + SourceLocation location; + + /** + * Current buffer containing all read characters. + */ + std::stringstream buffer; + + /** + * Set to false, once a non-whitespace character was reached. + */ + bool empty; + + /** + * Set to true if a whitespace was found -- these are normalized to a single + * space. + */ + bool hasWhitespace; + +public: + /** + * Constructor of the DataHandler class. + * + * @param reader is the CharReader that should be used for reading the data + * location. + * @param preserveWhitespaces should be set to true if all whitespaces + * should be preserved (for preformated environments). + */ + DataHandler(const CharReader &reader, bool preserveWhitespaces = false) + : reader(reader), + preserveWhitespaces(preserveWhitespaces), + location(reader.getSourceId()), + empty(true), + hasWhitespace(false) + { + } + + /** + * Appends the given character to the internal buffer. + * + * @param c is the character that should be appended. 
+ * @param wasEscaped is set to true if the character was escaped (prepended + * with a backslash), this allows whitespace characters to be explicitly + * included. + */ + void append(char c, bool wasEscaped = false) + { + // Check whether the character is a whitespace + const bool isWhitespace = + !wasEscaped && !preserveWhitespaces && Utils::isWhitespace(c); + + // Trim leading and trailing whitespaces + if (isWhitespace) { + if (!empty) { + hasWhitespace = true; + } + } else { + // Compress whitespaces to a single space + if (hasWhitespace) { + buffer << ' '; + hasWhitespace = false; + } + + // Append the character + buffer << c; + + // Update the "empty" flag and set the start and end offset + if (empty) { + location.setStart(reader.getOffset()); + empty = false; + } + location.setEnd(reader.getPeekOffset()); + } + } + + /** + * Returns true if no non-whitespace character has been found until now. + * + * @return true if the internal buffer is still empty. + */ + bool isEmpty() { return empty; } + + /** + * Returns a variant containg the read data and its location. 
+ * + * @return a variant with a string value containing the read data and the + * location being set to + */ + Variant getData() + { + Variant res = Variant::fromString(buffer.str()); + res.setLocation(location); + return res; + } +}; +} + +PlainFormatStreamReader::PlainFormatStreamReader(CharReader &reader, + Logger &logger) + : reader(reader), logger(logger), fieldIdx(0) +{ +} + +/* Comment handling */ + +void PlainFormatStreamReader::parseBlockComment() +{ + char c; + BlockCommentState state = BlockCommentState::DEFAULT; + while (reader.read(c)) { + switch (state) { + case BlockCommentState::DEFAULT: + if (c == '%') { + state = BlockCommentState::HAS_PERCENT; + } else if (c == '}') { + state = BlockCommentState::HAS_CURLY_CLOSE; + } + break; + case BlockCommentState::HAS_PERCENT: + if (c == '{') { + parseBlockComment(); + } + state = BlockCommentState::DEFAULT; + break; + case BlockCommentState::HAS_CURLY_CLOSE: + if (c == '%') { + return; + } + state = BlockCommentState::DEFAULT; + break; + } + } + + // Issue an error if the file ends while we are in a block comment + logger.error("File ended while being in a block comment", reader); +} + +void PlainFormatStreamReader::parseComment() +{ + char c; + bool first = true; + reader.consumePeek(); + while (reader.read(c)) { + // Continue parsing a block comment if a '{' is found + if (c == '{' && first) { + parseBlockComment(); + return; + } + if (c == '\n') { + return; + } + first = false; + } +} + +/* Top level parse function */ + +static const std::unordered_set<char> EscapeableCharacters{'\\', '<', '>', + '{', '}', '%'}; + +PlainFormatStreamReader::State PlainFormatStreamReader::parse() +{ +// Macro (sorry for that) used for checking whether there is data to issue, and +// if yes, aborting the loop, allowing for a reentry on a later parse call by +// resetting the peek cursor +#define CHECK_ISSUE_DATA() \ + { \ + if (!dataHandler.isEmpty()) { \ + reader.resetPeek(); \ + abort = true; \ + break; \ + } \ + } + + // 
Data handler + DataHandler dataHandler(reader); + + // Variable set to true if the parser loop should be left + bool abort = false; + + // Happily add characters to the dataHandler and handle escaping until a + // special character is reached. Then go to a specialiced parsing routine + char c; + while (!abort && reader.peek(c)) { + switch (c) { + case '\\': + reader.peek(c); + // Check whether this backslash just escaped some special or + // whitespace character or was the beginning of a command + if (EscapeableCharacters.count(c) == 0 && + !Utils::isWhitespace(c)) { + CHECK_ISSUE_DATA(); + // TODO: Parse command (starting from the backslash) + return State::COMMAND; + } + // A character was escaped, add it to the buffer, with the + // wasEscaped flag set to true + dataHandler.append(c, true); + break; + case '<': + // TODO: Annotations + break; + case '>': + // TODO: Annotations + break; + case '{': + // TODO: Issue start of field + break; + case '}': + // TODO: Issue end of field + case '%': + CHECK_ISSUE_DATA(); + parseComment(); + break; + case '\n': + CHECK_ISSUE_DATA(); + reader.consumePeek(); + return State::LINEBREAK; + default: + dataHandler.append(c, false); + } + + // Consume the peeked character if we did not abort, otherwise abort + if (!abort) { + reader.consumePeek(); + } else { + break; + } + } + + // Send out pending output data, otherwise we are at the end of the stream + if (!dataHandler.isEmpty()) { + data = dataHandler.getData(); + return State::DATA; + } + return State::END; +#undef CHECK_ISSUE_DATA +} +} + diff --git a/src/plugins/plain/PlainFormatStreamReader.hpp b/src/plugins/plain/PlainFormatStreamReader.hpp new file mode 100644 index 0000000..1a136cd --- /dev/null +++ b/src/plugins/plain/PlainFormatStreamReader.hpp @@ -0,0 +1,187 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by 
+ the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ +#define _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ + +/** + * @file PlainFormatStreamReader.hpp + * + * Provides classes for low-level classes for reading the plain TeX-esque + * format. The class provided here do not build any model objects and does not + * implement the Parser interfaces. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#include <core/common/Variant.hpp> + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; + +/** + * The PlainFormatStreamReader class provides a low-level reader for the plain + * TeX-esque format. The parser is constructed around a "parse" function, which + * reads data from the underlying CharReader until a new state is reached and + * indicates this state in a return value. The calling code then has to pull + * corresponding data from the stream reader. The reader already handles some + * invalid cases, but recovers from most errors and happily continues parsing. + */ +class PlainFormatStreamReader { +public: + /** + * Enum used to indicate which state the PlainFormatStreamReader class is in + * after calling the "parse" function. + */ + enum class State { + /** + * State returned if a fully featured command has been read. A command + * consists of the command name and its arguments (which optionally + * includes the name). + */ + COMMAND, + + /** + * State returned if data is given. 
The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. + */ + DATA, + + /** + * State returned if a linebreak has been reached (outside of comments). + */ + LINEBREAK, + + /** + * A user-defined entity has been found. The entity sequence is stored + * in the command name. + */ + ENTITY, + + /** + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). + */ + ANNOTATION_START, + + /** + * State returned if an annotation ends. The reader indicates which + * annotation ends. + */ + ANNOTATION_END, + + /** + * State returned if a new field started. The reader assures that the + * current field ends before a new field is started and that the field + * is not started if data has been given outside of a field. The + * field number is set to the current field index. + */ + FIELD_START, + + /** + * State returned if the current field ends. The reader assures that a + * field was actually open. + */ + FIELD_END, + + /** + * The end of the stream has been reached. + */ + END + }; + +private: + /** + * Reference to the CharReader instance from which the incomming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Variant containing the current command name (always is a string variant, + * but additionally contains the correct locatino of the name). + */ + Variant commandName; + + /** + * Variant containing the command arguments (always is a map or array + * variant, but additionally contains the source location of the arguments). + */ + Variant commandArguments; + + /** + * Variant containing the data that has been read (always is a string, + * contains the exact location of the data in the source file). 
+ */ + Variant data; + + /** + * Contains the field index of the current command. + */ + size_t fieldIdx; + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a generic comment. + */ + void parseComment(); + +public: + /** + * Constructor of the PlainFormatStreamReader class. Attaches the new + * PlainFormatStreamReader to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incomming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + PlainFormatStreamReader(CharReader &reader, Logger &logger); + + /** + * Continues parsing. Returns one of the states defined in the State enum. + * Callers should stop once the State::END state is reached. Use the getter + * functions to get more information about the current state, such as the + * command name or the data or the current field index. + * + * @return the new state the parser has reached. + */ + State parse(); + + /** + * Returns a reference at the internally stored data. + */ + const Variant& getData() {return data;} +}; +} + +#endif /* _OUSIA_PLAIN_FORMAT_STREAM_READER_HPP_ */ + diff --git a/test/plugins/plain/PlainFormatStreamReaderTest.cpp b/test/plugins/plain/PlainFormatStreamReaderTest.cpp new file mode 100644 index 0000000..c44d575 --- /dev/null +++ b/test/plugins/plain/PlainFormatStreamReaderTest.cpp @@ -0,0 +1,239 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <gtest/gtest.h> + +#include <iostream> + +#include <core/common/CharReader.hpp> +#include <core/frontend/TerminalLogger.hpp> +#include <plugins/plain/PlainFormatStreamReader.hpp> + +namespace ousia { + +static TerminalLogger logger(std::cerr, true); + +TEST(PlainFormatStreamReader, empty) +{ + const char *testString = ""; + CharReader charReader(testString); + + PlainFormatStreamReader reader(charReader, logger); + + ASSERT_EQ(PlainFormatStreamReader::State::END, reader.parse()); +} + +TEST(PlainFormatStreamReader, oneCharacter) +{ + const char *testString = "a"; + CharReader charReader(testString); + + PlainFormatStreamReader reader(charReader, logger); + + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); +} + +TEST(PlainFormatStreamReader, whitespaceElimination) +{ + const char *testString = " hello \t world "; + // 0123456 78901234 + // 0 1 + CharReader charReader(testString); + + PlainFormatStreamReader reader(charReader, logger); + + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("hello world", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(14U, loc.getEnd()); +} + +TEST(PlainFormatStreamReader, whitespaceEliminationWithLinebreak) +{ + const char *testString = " hello \n world "; + // 0123456 78901234 + // 0 1 + CharReader charReader(testString); + + PlainFormatStreamReader reader(charReader, logger); + + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + { + ASSERT_EQ("hello", reader.getData().asString()); + + SourceLocation loc = 
reader.getData().getLocation(); + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + } + ASSERT_EQ(PlainFormatStreamReader::State::LINEBREAK, reader.parse()); + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + { + ASSERT_EQ("world", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(9U, loc.getStart()); + ASSERT_EQ(14U, loc.getEnd()); + } +} + +TEST(PlainFormatStreamReader, escapeWhitespace) +{ + const char *testString = " hello \n\\ world "; + // 0123456 7 89012345 + // 0 1 + CharReader charReader(testString); + + PlainFormatStreamReader reader(charReader, logger); + + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + { + ASSERT_EQ("hello", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + } + ASSERT_EQ(PlainFormatStreamReader::State::LINEBREAK, reader.parse()); + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + { + ASSERT_EQ(" world", reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(8U, loc.getStart()); + ASSERT_EQ(15U, loc.getEnd()); + } +} + +static void testEscapeSpecialCharacter(const std::string &c) +{ + CharReader charReader(std::string("\\") + c); + PlainFormatStreamReader reader(charReader, logger); + EXPECT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + EXPECT_EQ(c, reader.getData().asString()); + + SourceLocation loc = reader.getData().getLocation(); + EXPECT_EQ(0U, loc.getStart()); + EXPECT_EQ(1U + c.size(), loc.getEnd()); +} + +TEST(PlainFormatStreamReader, escapeSpecialCharacters) +{ + testEscapeSpecialCharacter("\\"); + testEscapeSpecialCharacter("{"); + testEscapeSpecialCharacter("}"); + testEscapeSpecialCharacter("<"); + testEscapeSpecialCharacter(">"); +} + +TEST(PlainFormatStreamReader, simpleSingleLineComment) +{ + const char *testString = "% This is a single line 
comment"; + CharReader charReader(testString); + PlainFormatStreamReader reader(charReader, logger); + ASSERT_EQ(PlainFormatStreamReader::State::END, reader.parse()); +} + +TEST(PlainFormatStreamReader, singleLineComment) +{ + const char *testString = "a% This is a single line comment\nb"; + // 01234567890123456789012345678901 23 + // 0 1 2 3 + CharReader charReader(testString); + PlainFormatStreamReader reader(charReader, logger); + { + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + } + + { + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("b", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(33U, loc.getStart()); + ASSERT_EQ(34U, loc.getEnd()); + } + + ASSERT_EQ(PlainFormatStreamReader::State::END, reader.parse()); +} + +TEST(PlainFormatStreamReader, multilineComment) +{ + const char *testString = "a%{ This is a\n\n multiline line comment}%b"; + // 0123456789012 3 456789012345678901234567890 + // 0 1 2 3 4 + CharReader charReader(testString); + PlainFormatStreamReader reader(charReader, logger); + { + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + } + + { + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("b", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(40U, loc.getStart()); + ASSERT_EQ(41U, loc.getEnd()); + } + + ASSERT_EQ(PlainFormatStreamReader::State::END, reader.parse()); +} + +TEST(PlainFormatStreamReader, nestedMultilineComment) +{ + const char *testString = "a%{%{Another\n\n}%multiline line comment}%b"; + // 0123456789012 3 
456789012345678901234567890 + // 0 1 2 3 4 + CharReader charReader(testString); + PlainFormatStreamReader reader(charReader, logger); + { + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("a", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + } + + { + ASSERT_EQ(PlainFormatStreamReader::State::DATA, reader.parse()); + ASSERT_EQ("b", reader.getData().asString()); + SourceLocation loc = reader.getData().getLocation(); + ASSERT_EQ(40U, loc.getStart()); + ASSERT_EQ(41U, loc.getEnd()); + } + + ASSERT_EQ(PlainFormatStreamReader::State::END, reader.parse()); +} + + + +} + |