diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-14 23:47:11 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-14 23:47:11 +0100 |
commit | 65bbbd778f6e0a3668c859b0e22cced7075a726d (patch) | |
tree | c7e12a5b3330198deb5ab51eecdd12dd9e03286e /test/formats | |
parent | fec6ac1d65aee3e4e5c948b0f7cbdec7ceb6cb45 (diff) |
Moved DynamicTokenizer and TokenTrie to parser/utils
Diffstat (limited to 'test/formats')
-rw-r--r-- | test/formats/osdm/DynamicTokenizerTest.cpp | 415 | ||||
-rw-r--r-- | test/formats/osdm/TokenTrieTest.cpp | 92 |
2 files changed, 0 insertions, 507 deletions
diff --git a/test/formats/osdm/DynamicTokenizerTest.cpp b/test/formats/osdm/DynamicTokenizerTest.cpp deleted file mode 100644 index c1f8785..0000000 --- a/test/formats/osdm/DynamicTokenizerTest.cpp +++ /dev/null @@ -1,415 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <gtest/gtest.h> - -#include <core/common/CharReader.hpp> -#include <formats/osdm/DynamicTokenizer.hpp> - -namespace ousia { - -TEST(DynamicTokenizer, tokenRegistration) -{ - DynamicTokenizer tokenizer; - - ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); - - ASSERT_EQ(0U, tokenizer.registerToken("a")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); - - ASSERT_EQ(1U, tokenizer.registerToken("b")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); - - ASSERT_EQ(2U, tokenizer.registerToken("c")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); - - ASSERT_TRUE(tokenizer.unregisterToken(1U)); - ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); - - ASSERT_EQ(1U, tokenizer.registerToken("d")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); -} - -TEST(DynamicTokenizer, textTokenPreserveWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenTrimWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenCollapseWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, simpleReadToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ(':', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ('t', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - - char c; - ASSERT_FALSE(reader.peek(c)); - } -} - -TEST(DynamicTokenizer, simplePeekToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(5U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(6U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(11U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } -} - -TEST(DynamicTokenizer, ambiguousTokens) -{ - CharReader reader{"abc"}; - DynamicTokenizer tokenizer; - - TokenTypeId t1 = tokenizer.registerToken("abd"); - TokenTypeId t2 = tokenizer.registerToken("bc"); - - ASSERT_EQ(0U, t1); - ASSERT_EQ(1U, t2); - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("a", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(t2, token.type); - ASSERT_EQ("bc", token.content); - - loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(3U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); -} - -TEST(DynamicTokenizer, commentTestWhitespacePreserve) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector<DynamicToken> expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test ", SourceLocation{0, 5, 10}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(DynamicTokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector<DynamicToken> expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -} - diff --git a/test/formats/osdm/TokenTrieTest.cpp b/test/formats/osdm/TokenTrieTest.cpp deleted file mode 100644 index aacd6c0..0000000 --- a/test/formats/osdm/TokenTrieTest.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <gtest/gtest.h> - -#include <formats/osdm/TokenTrie.hpp> - -namespace ousia { - -static const TokenTypeId t1 = 0; -static const TokenTypeId t2 = 1; -static const TokenTypeId t3 = 2; -static const TokenTypeId t4 = 3; - -TEST(TokenTrie, registerToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_TRUE(tree.registerToken("hello", t4)); - - ASSERT_FALSE(tree.registerToken("", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - ASSERT_FALSE(tree.registerToken("b", t4)); - ASSERT_FALSE(tree.registerToken("hello", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - ASSERT_EQ(t4, tree.hasToken("hello")); - ASSERT_EQ(EmptyToken, tree.hasToken("")); - ASSERT_EQ(EmptyToken, tree.hasToken("abc")); -} - -TEST(TokenTrie, unregisterToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_FALSE(tree.registerToken("b", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(EmptyToken, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, tree.hasToken("b")); -} -} - |