// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/i18n/break_iterator.h" #include #include "base/logging.h" #include "third_party/icu/source/common/unicode/ubrk.h" #include "third_party/icu/source/common/unicode/uchar.h" #include "third_party/icu/source/common/unicode/ustring.h" namespace base { namespace i18n { const size_t npos = static_cast(-1); BreakIterator::BreakIterator(const StringPiece16& str, BreakType break_type) : iter_(nullptr), string_(str), break_type_(break_type), prev_(npos), pos_(0) {} BreakIterator::BreakIterator(const StringPiece16& str, const string16& rules) : iter_(nullptr), string_(str), rules_(rules), break_type_(RULE_BASED), prev_(npos), pos_(0) {} BreakIterator::~BreakIterator() { if (iter_) ubrk_close(static_cast(iter_)); } bool BreakIterator::Init() { UErrorCode status = U_ZERO_ERROR; UParseError parse_error; UBreakIteratorType break_type; switch (break_type_) { case BREAK_CHARACTER: break_type = UBRK_CHARACTER; break; case BREAK_WORD: break_type = UBRK_WORD; break; case BREAK_SENTENCE: break_type = UBRK_SENTENCE; break; case BREAK_LINE: case BREAK_NEWLINE: case RULE_BASED: // (Keep compiler happy, break_type not used in this case) break_type = UBRK_LINE; break; default: NOTREACHED() << "invalid break_type_"; return false; } if (break_type_ == RULE_BASED) { iter_ = ubrk_openRules(rules_.c_str(), static_cast(rules_.length()), string_.data(), static_cast(string_.size()), &parse_error, &status); if (U_FAILURE(status)) { NOTREACHED() << "ubrk_openRules failed to parse rule string at line " << parse_error.line << ", offset " << parse_error.offset; } } else { iter_ = ubrk_open(break_type, nullptr, string_.data(), static_cast(string_.size()), &status); if (U_FAILURE(status)) { NOTREACHED() << "ubrk_open failed for type " << break_type << " with error " << status; } } if (U_FAILURE(status)) { return false; } // Move the iterator to the beginning of the string. ubrk_first(static_cast(iter_)); return true; } bool BreakIterator::Advance() { int32_t pos; int32_t status; prev_ = pos_; switch (break_type_) { case BREAK_CHARACTER: case BREAK_WORD: case BREAK_LINE: case BREAK_SENTENCE: case RULE_BASED: pos = ubrk_next(static_cast(iter_)); if (pos == UBRK_DONE) { pos_ = npos; return false; } pos_ = static_cast(pos); return true; case BREAK_NEWLINE: do { pos = ubrk_next(static_cast(iter_)); if (pos == UBRK_DONE) break; pos_ = static_cast(pos); status = ubrk_getRuleStatus(static_cast(iter_)); } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT); if (pos == UBRK_DONE && prev_ == pos_) { pos_ = npos; return false; } return true; default: NOTREACHED() << "invalid break_type_"; return false; } } bool BreakIterator::SetText(const base::char16* text, const size_t length) { UErrorCode status = U_ZERO_ERROR; ubrk_setText(static_cast(iter_), text, length, &status); pos_ = 0; // implicit when ubrk_setText is done prev_ = npos; if (U_FAILURE(status)) { NOTREACHED() << "ubrk_setText failed"; return false; } string_ = StringPiece16(text, length); return true; } bool BreakIterator::IsWord() const { return GetWordBreakStatus() == IS_WORD_BREAK; } BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const { int32_t status = ubrk_getRuleStatus(static_cast(iter_)); if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) return IS_LINE_OR_CHAR_BREAK; // In ICU 60, trying to advance past the end of the text does not change // |status| so that |pos_| has to be checked as well as |status|. // See http://bugs.icu-project.org/trac/ticket/13447 . return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD : IS_WORD_BREAK; } bool BreakIterator::IsEndOfWord(size_t position) const { if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) return false; UBreakIterator* iter = static_cast(iter_); UBool boundary = ubrk_isBoundary(iter, static_cast(position)); int32_t status = ubrk_getRuleStatus(iter); return (!!boundary && status != UBRK_WORD_NONE); } bool BreakIterator::IsStartOfWord(size_t position) const { if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) return false; UBreakIterator* iter = static_cast(iter_); UBool boundary = ubrk_isBoundary(iter, static_cast(position)); ubrk_next(iter); int32_t next_status = ubrk_getRuleStatus(iter); return (!!boundary && next_status != UBRK_WORD_NONE); } bool BreakIterator::IsSentenceBoundary(size_t position) const { if (break_type_ != BREAK_SENTENCE && break_type_ != RULE_BASED) return false; UBreakIterator* iter = static_cast(iter_); return !!ubrk_isBoundary(iter, static_cast(position)); } bool BreakIterator::IsGraphemeBoundary(size_t position) const { if (break_type_ != BREAK_CHARACTER) return false; UBreakIterator* iter = static_cast(iter_); return !!ubrk_isBoundary(iter, static_cast(position)); } string16 BreakIterator::GetString() const { return GetStringPiece().as_string(); } StringPiece16 BreakIterator::GetStringPiece() const { DCHECK(prev_ != npos && pos_ != npos); return string_.substr(prev_, pos_ - prev_); } } // namespace i18n } // namespace base