From 65bbbd778f6e0a3668c859b0e22cced7075a726d Mon Sep 17 00:00:00 2001
From: Andreas Stöckel
Date: Sat, 14 Feb 2015 23:47:11 +0100
Subject: Moved DynamicTokenizer and TokenTrie to parser/utils

---
 test/core/parser/utils/TokenTrieTest.cpp |  92 +++++++
 test/core/parser/utils/TokenizerTest.cpp | 415 +++++++++++++++++++++++++++++++
 2 files changed, 507 insertions(+)
 create mode 100644 test/core/parser/utils/TokenTrieTest.cpp
 create mode 100644 test/core/parser/utils/TokenizerTest.cpp

diff --git a/test/core/parser/utils/TokenTrieTest.cpp b/test/core/parser/utils/TokenTrieTest.cpp
new file mode 100644
index 0000000..aacd6c0
--- /dev/null
+++ b/test/core/parser/utils/TokenTrieTest.cpp
@@ -0,0 +1,92 @@
+/*
+    Ousía
+    Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <core/parser/utils/TokenTrie.hpp>
+
+namespace ousia {
+
+static const TokenTypeId t1 = 0;
+static const TokenTypeId t2 = 1;
+static const TokenTypeId t3 = 2;
+static const TokenTypeId t4 = 3;
+
+TEST(TokenTrie, registerToken)
+{
+	TokenTrie tree;
+
+	ASSERT_TRUE(tree.registerToken("a", t1));
+	ASSERT_TRUE(tree.registerToken("ab", t2));
+	ASSERT_TRUE(tree.registerToken("b", t3));
+	ASSERT_TRUE(tree.registerToken("hello", t4));
+
+	ASSERT_FALSE(tree.registerToken("", t1));
+	ASSERT_FALSE(tree.registerToken("a", t4));
+	ASSERT_FALSE(tree.registerToken("ab", t4));
+	ASSERT_FALSE(tree.registerToken("b", t4));
+	ASSERT_FALSE(tree.registerToken("hello", t4));
+
+	ASSERT_EQ(t1, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(t3, tree.hasToken("b"));
+	ASSERT_EQ(t4, tree.hasToken("hello"));
+	ASSERT_EQ(EmptyToken, tree.hasToken(""));
+	ASSERT_EQ(EmptyToken, tree.hasToken("abc"));
+}
+
+TEST(TokenTrie, unregisterToken)
+{
+	TokenTrie tree;
+
+	ASSERT_TRUE(tree.registerToken("a", t1));
+	ASSERT_FALSE(tree.registerToken("a", t4));
+
+	ASSERT_TRUE(tree.registerToken("ab", t2));
+	ASSERT_FALSE(tree.registerToken("ab", t4));
+
+	ASSERT_TRUE(tree.registerToken("b", t3));
+	ASSERT_FALSE(tree.registerToken("b", t4));
+
+	ASSERT_EQ(t1, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(t3, tree.hasToken("b"));
+
+	ASSERT_TRUE(tree.unregisterToken("a"));
+	ASSERT_FALSE(tree.unregisterToken("a"));
+
+	ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(t3, tree.hasToken("b"));
+
+	ASSERT_TRUE(tree.unregisterToken("b"));
+	ASSERT_FALSE(tree.unregisterToken("b"));
+
+	ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+	ASSERT_EQ(t2, tree.hasToken("ab"));
+	ASSERT_EQ(EmptyToken, tree.hasToken("b"));
+
+	ASSERT_TRUE(tree.unregisterToken("ab"));
+	ASSERT_FALSE(tree.unregisterToken("ab"));
+
+	ASSERT_EQ(EmptyToken, tree.hasToken("a"));
+	ASSERT_EQ(EmptyToken, tree.hasToken("ab"));
+	ASSERT_EQ(EmptyToken, tree.hasToken("b"));
+}
+}
+
diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp
new file mode 100644
index 0000000..c1f8785
--- /dev/null
+++ b/test/core/parser/utils/TokenizerTest.cpp
@@ -0,0 +1,415 @@
+/*
+    Ousía
+    Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <gtest/gtest.h>
+
+#include <core/common/CharReader.hpp>
+#include <core/parser/utils/DynamicTokenizer.hpp>
+
+namespace ousia {
+
+TEST(DynamicTokenizer, tokenRegistration)
+{
+	DynamicTokenizer tokenizer;
+
+	ASSERT_EQ(EmptyToken, tokenizer.registerToken(""));
+
+	ASSERT_EQ(0U, tokenizer.registerToken("a"));
+	ASSERT_EQ(EmptyToken, tokenizer.registerToken("a"));
+	ASSERT_EQ("a", tokenizer.getTokenString(0U));
+
+	ASSERT_EQ(1U, tokenizer.registerToken("b"));
+	ASSERT_EQ(EmptyToken, tokenizer.registerToken("b"));
+	ASSERT_EQ("b", tokenizer.getTokenString(1U));
+
+	ASSERT_EQ(2U, tokenizer.registerToken("c"));
+	ASSERT_EQ(EmptyToken, tokenizer.registerToken("c"));
+	ASSERT_EQ("c", tokenizer.getTokenString(2U));
+
+	ASSERT_TRUE(tokenizer.unregisterToken(1U));
+	ASSERT_FALSE(tokenizer.unregisterToken(1U));
+	ASSERT_EQ("", tokenizer.getTokenString(1U));
+
+	ASSERT_EQ(1U, tokenizer.registerToken("d"));
+	ASSERT_EQ(EmptyToken, tokenizer.registerToken("d"));
+	ASSERT_EQ("d", tokenizer.getTokenString(1U));
+}
+
+TEST(DynamicTokenizer, textTokenPreserveWhitespace)
+{
+	{
+		CharReader reader{" this \t is only a \n\n test text "};
+		//                 012345 6789012345678 9 0123456789012345
+		//                 0      1             2 3
+		DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE};
+
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ(" this \t is only a \n\n test text ", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(36U, loc.getEnd());
+
+		ASSERT_FALSE(tokenizer.read(reader, token));
+	}
+
+	{
+		CharReader reader{"this \t is only a \n\n test text"};
+		//                 01234 5678901234567 8 9012345678901
+		//                 0     1             2 3
+		DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE};
+
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("this \t is only a \n\n test text", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(32U, loc.getEnd());
+
+		ASSERT_FALSE(tokenizer.read(reader, token));
+	}
+}
+
+TEST(DynamicTokenizer, textTokenTrimWhitespace)
+{
+	{
+		CharReader reader{" this \t is only a \n\n test text "};
+		//                 012345 6789012345678 9 0123456789012345
+		//                 0      1             2 3
+		DynamicTokenizer tokenizer{WhitespaceMode::TRIM};
+
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("this \t is only a \n\n test text", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(1U, loc.getStart());
+		ASSERT_EQ(33U, loc.getEnd());
+
+		ASSERT_FALSE(tokenizer.read(reader, token));
+	}
+
+	{
+		CharReader reader{"this \t is only a \n\n test text"};
+		//                 01234 5678901234567 8 9012345678901
+		//                 0     1             2 3
+		DynamicTokenizer tokenizer{WhitespaceMode::TRIM};
+
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("this \t is only a \n\n test text", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(32U, loc.getEnd());
+
+		ASSERT_FALSE(tokenizer.read(reader, token));
+	}
+}
+
+TEST(DynamicTokenizer, textTokenCollapseWhitespace)
+{
+	{
+		CharReader reader{" this \t is only a \n\n test text "};
+		//                 012345 6789012345678 9 0123456789012345
+		//                 0      1             2 3
+		DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE};
+
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("this is only a test text", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(1U, loc.getStart());
+		ASSERT_EQ(33U, loc.getEnd());
+
+		ASSERT_FALSE(tokenizer.read(reader, token));
+	}
+
+	{
+		CharReader reader{"this \t is only a \n\n test text"};
+		//                 01234 5678901234567 8 9012345678901
+		//                 0     1             2 3
+		DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE};
+
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("this is only a test text", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(32U, loc.getEnd());
+
+		ASSERT_FALSE(tokenizer.read(reader, token));
+	}
+}
+
+TEST(DynamicTokenizer, simpleReadToken)
+{
+	CharReader reader{"test1:test2"};
+	DynamicTokenizer tokenizer;
+
+	const TokenTypeId tid = tokenizer.registerToken(":");
+	ASSERT_EQ(0U, tid);
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("test1", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(5U, loc.getEnd());
+
+		char c;
+		ASSERT_TRUE(reader.peek(c));
+		ASSERT_EQ(':', c);
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+
+		ASSERT_EQ(tid, token.type);
+		ASSERT_EQ(":", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(5U, loc.getStart());
+		ASSERT_EQ(6U, loc.getEnd());
+
+		char c;
+		ASSERT_TRUE(reader.peek(c));
+		ASSERT_EQ('t', c);
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("test2", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(6U, loc.getStart());
+		ASSERT_EQ(11U, loc.getEnd());
+
+		char c;
+		ASSERT_FALSE(reader.peek(c));
+	}
+}
+
+TEST(DynamicTokenizer, simplePeekToken)
+{
+	CharReader reader{"test1:test2"};
+	DynamicTokenizer tokenizer;
+
+	const TokenTypeId tid = tokenizer.registerToken(":");
+	ASSERT_EQ(0U, tid);
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.peek(reader, token));
+
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("test1", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(5U, loc.getEnd());
+		ASSERT_EQ(0U, reader.getOffset());
+		ASSERT_EQ(5U, reader.getPeekOffset());
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.peek(reader, token));
+
+		ASSERT_EQ(tid, token.type);
+		ASSERT_EQ(":", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(5U, loc.getStart());
+		ASSERT_EQ(6U, loc.getEnd());
+		ASSERT_EQ(0U, reader.getOffset());
+		ASSERT_EQ(6U, reader.getPeekOffset());
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.peek(reader, token));
+
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("test2", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(6U, loc.getStart());
+		ASSERT_EQ(11U, loc.getEnd());
+		ASSERT_EQ(0U, reader.getOffset());
+		ASSERT_EQ(11U, reader.getPeekOffset());
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("test1", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(0U, loc.getStart());
+		ASSERT_EQ(5U, loc.getEnd());
+		ASSERT_EQ(5U, reader.getOffset());
+		ASSERT_EQ(5U, reader.getPeekOffset());
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+
+		ASSERT_EQ(tid, token.type);
+		ASSERT_EQ(":", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(5U, loc.getStart());
+		ASSERT_EQ(6U, loc.getEnd());
+		ASSERT_EQ(6U, reader.getOffset());
+		ASSERT_EQ(6U, reader.getPeekOffset());
+	}
+
+	{
+		DynamicToken token;
+		ASSERT_TRUE(tokenizer.read(reader, token));
+
+		ASSERT_EQ(TextToken, token.type);
+		ASSERT_EQ("test2", token.content);
+
+		SourceLocation loc = token.location;
+		ASSERT_EQ(6U, loc.getStart());
+		ASSERT_EQ(11U, loc.getEnd());
+		ASSERT_EQ(11U, reader.getOffset());
+		ASSERT_EQ(11U, reader.getPeekOffset());
+	}
+}
+
+TEST(DynamicTokenizer, ambiguousTokens)
+{
+	CharReader reader{"abc"};
+	DynamicTokenizer tokenizer;
+
+	TokenTypeId t1 = tokenizer.registerToken("abd");
+	TokenTypeId t2 = tokenizer.registerToken("bc");
+
+	ASSERT_EQ(0U, t1);
+	ASSERT_EQ(1U, t2);
+
+	DynamicToken token;
+	ASSERT_TRUE(tokenizer.read(reader, token));
+
+	ASSERT_EQ(TextToken, token.type);
+	ASSERT_EQ("a", token.content);
+
+	SourceLocation loc = token.location;
+	ASSERT_EQ(0U, loc.getStart());
+	ASSERT_EQ(1U, loc.getEnd());
+
+	ASSERT_TRUE(tokenizer.read(reader, token));
+
+	ASSERT_EQ(t2, token.type);
+	ASSERT_EQ("bc", token.content);
+
+	loc = token.location;
+	ASSERT_EQ(1U, loc.getStart());
+	ASSERT_EQ(3U, loc.getEnd());
+
+	ASSERT_FALSE(tokenizer.read(reader, token));
+}
+
+TEST(DynamicTokenizer, commentTestWhitespacePreserve)
+{
+	CharReader reader{"Test/Test /* Block Comment */", 0};
+	//                 012345678901234567890123456789
+	//                 0         1         2
+	DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE);
+
+	const TokenTypeId t1 = tokenizer.registerToken("/");
+	const TokenTypeId t2 = tokenizer.registerToken("/*");
+	const TokenTypeId t3 = tokenizer.registerToken("*/");
+
+	std::vector<DynamicToken> expected = {
+	    {TextToken, "Test", SourceLocation{0, 0, 4}},
+	    {t1, "/", SourceLocation{0, 4, 5}},
+	    {TextToken, "Test ", SourceLocation{0, 5, 10}},
+	    {t2, "/*", SourceLocation{0, 10, 12}},
+	    {TextToken, " Block Comment ", SourceLocation{0, 12, 27}},
+	    {t3, "*/", SourceLocation{0, 27, 29}}};
+
+	DynamicToken t;
+	for (auto &te : expected) {
+		EXPECT_TRUE(tokenizer.read(reader, t));
+		EXPECT_EQ(te.type, t.type);
+		EXPECT_EQ(te.content, t.content);
+		EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
+		EXPECT_EQ(te.location.getStart(), t.location.getStart());
+		EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
+	}
+	ASSERT_FALSE(tokenizer.read(reader, t));
+}
+
+TEST(DynamicTokenizer, commentTestWhitespaceCollapse)
+{
+	CharReader reader{"Test/Test /* Block Comment */", 0};
+	//                 012345678901234567890123456789
+	//                 0         1         2
+	DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE);
+
+	const TokenTypeId t1 = tokenizer.registerToken("/");
+	const TokenTypeId t2 = tokenizer.registerToken("/*");
+	const TokenTypeId t3 = tokenizer.registerToken("*/");
+
+	std::vector<DynamicToken> expected = {
+	    {TextToken, "Test", SourceLocation{0, 0, 4}},
+	    {t1, "/", SourceLocation{0, 4, 5}},
+	    {TextToken, "Test", SourceLocation{0, 5, 9}},
+	    {t2, "/*", SourceLocation{0, 10, 12}},
+	    {TextToken, "Block Comment", SourceLocation{0, 13, 26}},
+	    {t3, "*/", SourceLocation{0, 27, 29}}};
+
+	DynamicToken t;
+	for (auto &te : expected) {
+		EXPECT_TRUE(tokenizer.read(reader, t));
+		EXPECT_EQ(te.type, t.type);
+		EXPECT_EQ(te.content, t.content);
+		EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId());
+		EXPECT_EQ(te.location.getStart(), t.location.getStart());
+		EXPECT_EQ(te.location.getEnd(), t.location.getEnd());
+	}
+	ASSERT_FALSE(tokenizer.read(reader, t));
+}
+
+}
--
cgit v1.2.3
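
Editor's note: for readers skimming the patch, the sketch below summarizes how the
DynamicTokenizer API exercised by these tests fits together. It is not part of the
commit; the include paths are assumptions based on the file moves above, and main()
is purely illustrative.

    #include <iostream>

    #include <core/common/CharReader.hpp>
    #include <core/parser/utils/DynamicTokenizer.hpp>

    using namespace ousia;

    int main()
    {
    	// Read characters from an in-memory buffer.
    	CharReader reader{"key:value"};

    	// Trim leading/trailing whitespace from text tokens.
    	DynamicTokenizer tokenizer{WhitespaceMode::TRIM};

    	// Registered tokens are identified by the returned TokenTypeId;
    	// everything between them is emitted as a TextToken.
    	TokenTypeId colon = tokenizer.registerToken(":");

    	DynamicToken token;
    	while (tokenizer.read(reader, token)) {
    		if (token.type == colon) {
    			std::cout << "separator at " << token.location.getStart() << "\n";
    		} else {
    			std::cout << "text: " << token.content << "\n";
    		}
    	}
    	return 0;
    }

As the ambiguousTokens test shows, overlapping candidates are resolved by matching
from the current position ("bc" wins over the dead-end prefix of "abd"), with the
TokenTrie providing the longest-prefix lookup underneath.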