/*
Ousía
Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file WhitespaceHandler.hpp
*
* Contains the WhitespaceHandler classes which are used in multiple places to
* trim, compact or preserve whitespaces while at the same time maintaining the
* position information associated with the input strings.
*
* @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_WHITESPACE_HANDLER_HPP_
#define _OUSIA_WHITESPACE_HANDLER_HPP_
#include <cstddef>
#include <string>
#include <vector>
#include "Utils.hpp"
namespace ousia {
/**
* WhitespaceHandler is a base class that can be used to collect text on a
* character-by-character basis. Note that this class and its descendants are
* hoped to be inlined by the compiler (and used in conjunction with templates),
* thus they are fully defined inside this header.
*/
class WhitespaceHandler {
public:
    /**
     * Start position of the extracted text. Zero until a derived handler
     * stores its first character.
     */
    size_t textStart;

    /**
     * End position of the extracted text. Zero until a derived handler stores
     * its first character.
     */
    size_t textEnd;

    /**
     * Buffer containing the extracted text.
     */
    std::vector<char> textBuf;

    /**
     * Constructor of the WhitespaceHandler base class. Initializes the start
     * and end position with zeros and leaves the text buffer empty.
     */
    WhitespaceHandler() : textStart(0), textEnd(0) {}

    /**
     * Returns true if this whitespace handler has found any text and a text
     * token could be emitted.
     *
     * @return true if the internal data buffer is non-empty.
     */
    bool hasText() const { return !textBuf.empty(); }

    /**
     * Returns the content of the WhitespaceHandler as string.
     *
     * @return a copy of the characters currently stored in the text buffer;
     * an empty string if the buffer is empty.
     */
    std::string toString() const
    {
        // The iterator-range constructor is well-defined for an empty buffer,
        // where data() is allowed to return a null pointer.
        return std::string(textBuf.begin(), textBuf.end());
    }
};
/**
* The PreservingWhitespaceHandler class preserves all characters unmodified,
* including whitespace characters.
*/
class PreservingWhitespaceHandler : public WhitespaceHandler {
public:
/**
* Appends the given character to the internal text buffer, does not
* eliminate whitespace.
*
* @param c is the character that should be appended to the internal buffer.
* @param start is the start byte offset of the given character.
* @param end is the end byte offset of the given character.
*/
void append(char c, size_t start, size_t end)
{
append(c, start, end, textBuf, textStart, textEnd);
}
/**
* Static version of PreservingWhitespaceHandler append
*
* @param c is the character that should be appended to the internal buffer.
* @param start is the start byte offset of the given character.
* @param end is the end byte offset of the given character.
* @param textBuf is a reference at the text buffer that is to be used.
* @param textStart is a reference at the text start variable that is to be
* used.
* @param textEnd is a reference at the text end variable that is to be
* used.
*/
static void append(char c, size_t start, size_t end,
std::vector &textBuf, size_t &textStart,
size_t &textEnd)
{
if (textBuf.empty()) {
textStart = start;
}
textEnd = end;
textBuf.push_back(c);
}
};
/**
* The TrimmingWhitespaceHandler class trims all whitespace characters at the
* beginning and the end of a text section but leaves all other characters
* unmodified, including whitespace characters.
*/
class TrimmingWhitespaceHandler : public WhitespaceHandler {
public:
/**
* Buffer used internally to temporarily store all whitespace characters.
* They are only added to the output buffer if another non-whitespace
* character is reached.
*/
std::vector whitespaceBuf;
/**
* Appends the given character to the internal text buffer, eliminates
* whitespace characters at the begin and end of the text.
*
* @param c is the character that should be appended to the internal buffer.
* @param start is the start byte offset of the given character.
* @param end is the end byte offset of the given character.
*/
void append(char c, size_t start, size_t end)
{
append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf);
}
/**
* Static version of TrimmingWhitespaceHandler append
*
* @param c is the character that should be appended to the internal buffer.
* @param start is the start byte offset of the given character.
* @param end is the end byte offset of the given character.
* @param textBuf is a reference at the text buffer that is to be used.
* @param textStart is a reference at the text start variable that is to be
* used.
* @param textEnd is a reference at the text end variable that is to be
* used.
* @param whitespaceBuf is a reference at the buffer for storing whitespace
* characters.
*/
static void append(char c, size_t start, size_t end,
std::vector &textBuf, size_t &textStart,
size_t &textEnd, std::vector &whitespaceBuf)
{
// Handle whitespace characters
if (Utils::isWhitespace(c)) {
if (!textBuf.empty()) {
whitespaceBuf.push_back(c);
}
return;
}
// Set the start and end offset correctly
if (textBuf.empty()) {
textStart = start;
}
textEnd = end;
// Store the character
if (!whitespaceBuf.empty()) {
textBuf.insert(textBuf.end(), whitespaceBuf.begin(),
whitespaceBuf.end());
whitespaceBuf.clear();
}
textBuf.push_back(c);
}
};
/**
* The CollapsingWhitespaceHandler trims characters at the beginning and end of
* the text and reduces multiple whitespace characters to a single blank.
*/
class CollapsingWhitespaceHandler : public WhitespaceHandler {
public:
/**
* Flag set to true if a whitespace character was reached.
*/
bool hasWhitespace = false;
/**
* Appends the given character to the internal text buffer, eliminates
* redundant whitespace characters.
*
* @param c is the character that should be appended to the internal buffer.
* @param start is the start byte offset of the given character.
* @param end is the end byte offset of the given character.
*/
void append(char c, size_t start, size_t end)
{
append(c, start, end, textBuf, textStart, textEnd, hasWhitespace);
}
/**
* Static version of CollapsingWhitespaceHandler append
*
* @param c is the character that should be appended to the internal buffer.
* @param start is the start byte offset of the given character.
* @param end is the end byte offset of the given character.
* @param textBuf is a reference at the text buffer that is to be used.
* @param textStart is a reference at the text start variable that is to be
* used.
* @param textEnd is a reference at the text end variable that is to be
* used.
* @param hasWhitespace is a reference at the "hasWhitespace" flag.
*/
static void append(char c, size_t start, size_t end,
std::vector &textBuf, size_t &textStart,
size_t &textEnd, bool &hasWhitespace)
{
// Handle whitespace characters
if (Utils::isWhitespace(c)) {
if (!textBuf.empty()) {
hasWhitespace = true;
}
return;
}
// Set the start and end offset correctly
if (textBuf.empty()) {
textStart = start;
}
textEnd = end;
// Store the character
if (hasWhitespace) {
textBuf.push_back(' ');
hasWhitespace = false;
}
textBuf.push_back(c);
}
};
/**
* Function that can be used to append the given buffer (e.g. a string or a
* vector) to the whitespace handler.
*
* @tparam WhitespaceHandler is one of the WhitespaceHandler classes.
* @tparam Buffer is an iterable type.
* @param handler is the handler to which the characters of the Buffer should be
* appended.
* @param buf is the buffer from which the characters should be read.
* @param start is the start byte offset. Each character is counted as one byte.
*/
template <typename WhitespaceHandler, typename Buffer>
inline void appendToWhitespaceHandler(WhitespaceHandler &handler, Buffer buf,
                                      size_t start)
{
    // "buf" is taken by value, so the loop iterates over a private copy and
    // is unaffected by whatever handler.append() does to other buffers.
    for (auto elem : buf) {
        // Every element is treated as exactly one byte wide.
        handler.append(elem, start, start + 1);
        ++start;
    }
}
}
#endif /* _OUSIA_WHITESPACE_HANDLER_HPP_ */