/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <gtest/gtest.h>
#include <core/common/CharReader.hpp>
#include <core/parser/utils/TokenizedData.hpp>
#include <core/parser/utils/Tokenizer.hpp>
#include "TokenizedDataTestUtils.hpp"
namespace ousia {
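// Reads the next token from the tokenizer and asserts that it is a primary
// token with the given id and content; start and end offsets are only
// checked if they are explicitly given.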
static void assertPrimaryToken(CharReader &reader, Tokenizer &tokenizer,
TokenId id, const std::string &text,
SourceOffset start = InvalidSourceOffset,
SourceOffset end = InvalidSourceOffset,
SourceId sourceId = InvalidSourceId)
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
EXPECT_EQ(id, token.id);
EXPECT_EQ(text, token.content);
if (start != InvalidSourceOffset) {
EXPECT_EQ(start, token.getLocation().getStart());
}
if (end != InvalidSourceOffset) {
EXPECT_EQ(end, token.getLocation().getEnd());
}
EXPECT_EQ(sourceId, token.getLocation().getSourceId());
}
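// Asserts that the given token is a data token and that reading its attached
// TokenizedData with the given whitespace mode yields exactly the expected
// text with the expected source offsets.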
static void expectData(const std::string &expected, SourceOffset tokenStart,
SourceOffset tokenEnd, SourceOffset textStart,
SourceOffset textEnd, const Token &token,
TokenizedData &data,
WhitespaceMode mode = WhitespaceMode::PRESERVE)
{
ASSERT_EQ(Tokens::Data, token.id);
Token textToken;
TokenizedDataReader reader = data.reader();
ASSERT_TRUE(reader.read(textToken, TokenSet{}, mode));
EXPECT_EQ(expected, textToken.content);
EXPECT_EQ(tokenStart, token.location.getStart());
EXPECT_EQ(tokenEnd, token.location.getEnd());
EXPECT_EQ(textStart, textToken.getLocation().getStart());
EXPECT_EQ(textEnd, textToken.getLocation().getEnd());
EXPECT_TRUE(reader.atEnd());
}
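// Reads the next token from the tokenizer and forwards it to expectData to
// check the content and location of the contained character data.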
static void assertDataToken(CharReader &reader, Tokenizer &tokenizer,
const std::string &expected,
SourceOffset tokenStart, SourceOffset tokenEnd,
SourceOffset textStart, SourceOffset textEnd,
WhitespaceMode mode = WhitespaceMode::PRESERVE)
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData(expected, tokenStart, tokenEnd, textStart, textEnd, token, data,
mode);
}
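// Token registration must hand out consecutive ids, reject duplicate and
// empty strings, reuse the ids of unregistered tokens and allow reverse
// lookup of the registered token string.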
TEST(Tokenizer, tokenRegistration)
{
Tokenizer tokenizer;
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken(""));
ASSERT_EQ(0U, tokenizer.registerToken("a"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("a"));
ASSERT_EQ("a", tokenizer.lookupToken(0U).string);
ASSERT_EQ(1U, tokenizer.registerToken("b"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("b"));
ASSERT_EQ("b", tokenizer.lookupToken(1U).string);
ASSERT_EQ(2U, tokenizer.registerToken("c"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("c"));
ASSERT_EQ("c", tokenizer.lookupToken(2U).string);
ASSERT_TRUE(tokenizer.unregisterToken(1U));
ASSERT_FALSE(tokenizer.unregisterToken(1U));
ASSERT_EQ("", tokenizer.lookupToken(1U).string);
ASSERT_EQ(1U, tokenizer.registerToken("d"));
ASSERT_EQ(Tokens::Empty, tokenizer.registerToken("d"));
ASSERT_EQ("d", tokenizer.lookupToken(1U).string);
}
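// Without any registered tokens the complete input is returned as a single
// data token; PRESERVE must keep leading, trailing and internal whitespace.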
TEST(Tokenizer, textTokenPreserveWhitespace)
{
{
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
Tokenizer tokenizer;
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData(" this \t is only a \n\n test text ", 0, 36, 0, 36,
token, data, WhitespaceMode::PRESERVE);
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
Tokenizer tokenizer;
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("this \t is only a \n\n test text", 0, 32, 0, 32, token,
data, WhitespaceMode::PRESERVE);
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
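// TRIM removes leading and trailing whitespace from the extracted text, but
// the data token itself still spans the entire raw input.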
TEST(Tokenizer, textTokenTrimWhitespace)
{
{
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
Tokenizer tokenizer;
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("this \t is only a \n\n test text", 0, 36, 1, 33, token,
data, WhitespaceMode::TRIM);
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
Tokenizer tokenizer;
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("this \t is only a \n\n test text", 0, 32, 0, 32, token,
data, WhitespaceMode::TRIM);
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
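// COLLAPSE additionally replaces internal whitespace sequences by a single
// space.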
TEST(Tokenizer, textTokenCollapseWhitespace)
{
{
CharReader reader{" this \t is only a \n\n test text "};
// 012345 6789012345678 9 0123456789012345
// 0 1 2 3
Tokenizer tokenizer;
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("this is only a test text", 0, 36, 1, 33, token, data,
WhitespaceMode::COLLAPSE);
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
{
CharReader reader{"this \t is only a \n\n test text"};
// 01234 5678901234567 8 9012345678901
// 0 1 2 3
Tokenizer tokenizer;
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("this is only a test text", 0, 32, 0, 32, token, data,
WhitespaceMode::COLLAPSE);
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
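// A single registered primary token (":") splits the input into a
// data-token-data sequence; read() consumes the underlying character stream.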
TEST(Tokenizer, simpleReadToken)
{
CharReader reader{"test1:test2"};
Tokenizer tokenizer;
const TokenId tid = tokenizer.registerToken(":");
ASSERT_EQ(0U, tid);
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
expectData("test1", 0, 5, 0, 5, token, data);
char c;
ASSERT_TRUE(reader.peek(c));
ASSERT_EQ(':', c);
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
SourceLocation loc = token.location;
ASSERT_EQ(5U, loc.getStart());
ASSERT_EQ(6U, loc.getEnd());
char c;
ASSERT_TRUE(reader.peek(c));
ASSERT_EQ('t', c);
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("test2", 6, 11, 6, 11, token, data);
char c;
ASSERT_FALSE(reader.peek(c));
}
}
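// peek() returns the same token sequence as read(), but only advances the
// peek offset; subsequent read() calls then advance the read offset.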
TEST(Tokenizer, simplePeekToken)
{
CharReader reader{"test1:test2"};
Tokenizer tokenizer;
const TokenId tid = tokenizer.registerToken(":");
ASSERT_EQ(0U, tid);
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.peek(reader, token, data));
expectData("test1", 0, 5, 0, 5, token, data);
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(5U, reader.getPeekOffset());
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.peek(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
SourceLocation loc = token.location;
ASSERT_EQ(5U, loc.getStart());
ASSERT_EQ(6U, loc.getEnd());
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(6U, reader.getPeekOffset());
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.peek(reader, token, data));
expectData("test2", 6, 11, 6, 11, token, data);
ASSERT_EQ(0U, reader.getOffset());
ASSERT_EQ(11U, reader.getPeekOffset());
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("test1", 0, 5, 0, 5, token, data);
ASSERT_EQ(5U, reader.getOffset());
ASSERT_EQ(5U, reader.getPeekOffset());
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tid, token.id);
ASSERT_EQ(":", token.content);
SourceLocation loc = token.location;
ASSERT_EQ(5U, loc.getStart());
ASSERT_EQ(6U, loc.getEnd());
ASSERT_EQ(6U, reader.getOffset());
ASSERT_EQ(6U, reader.getPeekOffset());
}
{
Token token;
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("test2", 6, 11, 6, 11, token, data);
ASSERT_EQ(11U, reader.getOffset());
ASSERT_EQ(11U, reader.getPeekOffset());
}
}
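// If the longer token candidate "abd" fails to match on the final character,
// the tokenizer must backtrack and emit "a" as data followed by the token
// "bc".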
TEST(Tokenizer, ambiguousTokens)
{
CharReader reader{"abc"};
Tokenizer tokenizer;
TokenizedData data;
TokenId t1 = tokenizer.registerToken("abd");
TokenId t2 = tokenizer.registerToken("bc");
ASSERT_EQ(0U, t1);
ASSERT_EQ(1U, t2);
Token token;
data.clear();
ASSERT_TRUE(tokenizer.read(reader, token, data));
expectData("a", 0, 1, 0, 1, token, data);
SourceLocation loc = token.location;
ASSERT_EQ(0U, loc.getStart());
ASSERT_EQ(1U, loc.getEnd());
data.clear();
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(t2, token.id);
ASSERT_EQ("bc", token.content);
loc = token.location;
ASSERT_EQ(1U, loc.getStart());
ASSERT_EQ(3U, loc.getEnd());
data.clear();
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
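// The primary tokens "/", "/*" and "*/" are emitted in source order, with
// the whitespace of the data between them preserved.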
TEST(Tokenizer, commentTestWhitespacePreserve)
{
CharReader reader{"Test/Test /* Block Comment */", 0};
//                 012345678901234567890123456789
//                 0         1         2
Tokenizer tokenizer;
const TokenId t1 = tokenizer.registerToken("/");
const TokenId t2 = tokenizer.registerToken("/*");
const TokenId t3 = tokenizer.registerToken("*/");
std::vector<Token> expected = {
{Tokens::Data, "Test", SourceLocation{0, 0, 4}},
{t1, "/", SourceLocation{0, 4, 5}},
{Tokens::Data, "Test ", SourceLocation{0, 5, 10}},
{t2, "/*", SourceLocation{0, 10, 12}},
{Tokens::Data, " Block Comment ", SourceLocation{0, 12, 27}},
{t3, "*/", SourceLocation{0, 27, 29}}};
Token t;
for (auto &te : expected) {
TokenizedData data(0);
EXPECT_TRUE(tokenizer.read(reader, t, data));
EXPECT_EQ(te.id, t.id);
if (te.id != Tokens::Data) {
EXPECT_EQ(te.content, t.content);
} else {
TokenizedDataReader dataReader = data.reader();
Token textToken;
ASSERT_TRUE(dataReader.read(textToken, TokenSet{},
WhitespaceMode::PRESERVE));
EXPECT_TRUE(dataReader.atEnd());
EXPECT_EQ(te.content, textToken.content);
}
EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
EXPECT_EQ(te.location.getStart(), t.location.getStart());
EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
}
TokenizedData data;
ASSERT_FALSE(tokenizer.read(reader, t, data));
}
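// Tokens registered as non-primary are not returned by Tokenizer::read();
// they are embedded in the TokenizedData and only surface when it is read
// with a matching TokenSet.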
TEST(Tokenizer, nonPrimaryTokens)
{
CharReader reader{
"<>"};
// 012345678901234567890 12345678901234567890123456789012345678901234567
// 0 1 2 3 4 5 6
Tokenizer tokenizer;
TokenId tBackslash = tokenizer.registerToken("\\");
TokenId tDollar = tokenizer.registerToken("$", false);
TokenId tSpeechStart = tokenizer.registerToken("<<", false);
TokenId tSpeechEnd = tokenizer.registerToken(">>", false);
TokenSet tokens = TokenSet{tDollar, tSpeechStart, tSpeechEnd};
Token token, textToken;
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
TokenizedDataReader dataReader = data.reader();
assertToken(dataReader, tSpeechStart, "<<", tokens,
WhitespaceMode::TRIM, 0, 2);
assertText(dataReader, "switch to", tokens, WhitespaceMode::TRIM, 2,
11);
assertToken(dataReader, tDollar, "$", tokens, WhitespaceMode::TRIM, 12,
13);
assertText(dataReader, "inline", tokens, WhitespaceMode::TRIM, 13, 19);
assertEnd(dataReader);
}
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(tBackslash, token.id);
ASSERT_EQ(20U, token.location.getStart());
ASSERT_EQ(21U, token.location.getEnd());
}
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
TokenizedDataReader dataReader = data.reader();
assertText(dataReader, "math mode", tokens, WhitespaceMode::TRIM, 21,
30);
assertToken(dataReader, tDollar, "$", tokens, WhitespaceMode::TRIM, 30,
31);
assertText(dataReader, "they said, see the world they said", tokens,
WhitespaceMode::TRIM, 32, 66);
assertToken(dataReader, tSpeechEnd, ">>", tokens, WhitespaceMode::TRIM,
66, 68);
assertEnd(dataReader);
}
TokenizedData data;
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
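// Primary tokens terminate the current data token, whereas non-primary
// tokens are embedded into it.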
TEST(Tokenizer, primaryNonPrimaryTokenInteraction)
{
CharReader reader{"<><<<>>"};
// 01234567890123456789012 3456789012345
// 0 1 2 3
Tokenizer tokenizer;
TokenId tP1 = tokenizer.registerToken("<", true);
TokenId tP2 = tokenizer.registerToken(">", true);
TokenId tP3 = tokenizer.registerToken("\\>", true);
TokenId tN1 = tokenizer.registerToken("<<", false);
TokenId tN2 = tokenizer.registerToken(">>", false);
TokenSet tokens = TokenSet{tN1, tN2};
Token token, textToken;
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
TokenizedDataReader dataReader = data.reader();
assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 0, 2);
assertText(dataReader, "test1", tokens, WhitespaceMode::TRIM, 2, 7);
assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 7, 9);
assertEnd(dataReader);
}
assertPrimaryToken(reader, tokenizer, tP1, "<", 9, 10);
assertDataToken(reader, tokenizer, "test2", 10, 15, 10, 15);
assertPrimaryToken(reader, tokenizer, tP2, ">", 15, 16);
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
TokenizedDataReader dataReader = data.reader();
assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 16, 18);
assertText(dataReader, "test3", tokens, WhitespaceMode::TRIM, 18, 23);
assertEnd(dataReader);
}
assertPrimaryToken(reader, tokenizer, tP3, "\\>", 23, 25);
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
TokenizedDataReader dataReader = data.reader();
assertToken(dataReader, tN1, "<<", tokens, WhitespaceMode::TRIM, 25, 27);
assertEnd(dataReader);
}
assertPrimaryToken(reader, tokenizer, tP1, "<", 27, 28);
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ(Tokens::Data, token.id);
TokenizedDataReader dataReader = data.reader();
assertText(dataReader, "test4", tokens, WhitespaceMode::TRIM, 28, 33);
assertToken(dataReader, tN2, ">>", tokens, WhitespaceMode::TRIM, 33, 35);
assertEnd(dataReader);
}
assertPrimaryToken(reader, tokenizer, tP2, ">", 35, 36);
TokenizedData data;
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
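// When both "\" and "<\" are registered, the input "<\" must be matched as
// the single longer token rather than as data followed by the "\" token.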
TEST(Tokenizer, ambiguousTokens2)
{
CharReader reader{"<\\"};
Tokenizer tokenizer;
TokenId tBackslash = tokenizer.registerToken("\\");
TokenId tAnnotationStart = tokenizer.registerToken("<\\");
TokenSet tokens = TokenSet{tBackslash, tAnnotationStart};
Token token;
{
TokenizedData data;
ASSERT_TRUE(tokenizer.read(reader, token, data));
ASSERT_EQ("<\\", token.content);
ASSERT_EQ(tAnnotationStart, token.id);
ASSERT_TRUE(data.empty());
}
{
TokenizedData data;
ASSERT_FALSE(tokenizer.read(reader, token, data));
}
}
}