// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/strings/utf_offset_string_conversions.h" #include #include #include #include "base/logging.h" #include "base/strings/string_piece.h" #include "base/strings/utf_string_conversion_utils.h" namespace base { OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, size_t original_length, size_t output_length) : original_offset(original_offset), original_length(original_length), output_length(output_length) { } // static void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, std::vector* offsets_for_adjustment, size_t limit) { DCHECK(offsets_for_adjustment); for (auto& i : *offsets_for_adjustment) AdjustOffset(adjustments, &i, limit); } // static void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, size_t* offset, size_t limit) { DCHECK(offset); if (*offset == string16::npos) return; int adjustment = 0; for (const auto& i : adjustments) { if (*offset <= i.original_offset) break; if (*offset < (i.original_offset + i.original_length)) { *offset = string16::npos; return; } adjustment += static_cast(i.original_length - i.output_length); } *offset -= adjustment; if (*offset > limit) *offset = string16::npos; } // static void OffsetAdjuster::UnadjustOffsets( const Adjustments& adjustments, std::vector* offsets_for_unadjustment) { if (!offsets_for_unadjustment || adjustments.empty()) return; for (auto& i : *offsets_for_unadjustment) UnadjustOffset(adjustments, &i); } // static void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, size_t* offset) { if (*offset == string16::npos) return; int adjustment = 0; for (const auto& i : adjustments) { if (*offset + adjustment <= i.original_offset) break; adjustment += static_cast(i.original_length - i.output_length); if ((*offset + adjustment) < (i.original_offset + i.original_length)) { *offset = string16::npos; return; } } *offset += adjustment; } // static void OffsetAdjuster::MergeSequentialAdjustments( const Adjustments& first_adjustments, Adjustments* adjustments_on_adjusted_string) { auto adjusted_iter = adjustments_on_adjusted_string->begin(); auto first_iter = first_adjustments.begin(); // Simultaneously iterate over all |adjustments_on_adjusted_string| and // |first_adjustments|, pushing adjustments at the end of // |adjustments_builder| as we go. |shift| keeps track of the current number // of characters collapsed by |first_adjustments| up to this point. // |currently_collapsing| keeps track of the number of characters collapsed by // |first_adjustments| into the current |adjusted_iter|'s length. These are // characters that will change |shift| as soon as we're done processing the // current |adjusted_iter|; they are not yet reflected in |shift|. size_t shift = 0; size_t currently_collapsing = 0; // While we *could* update |adjustments_on_adjusted_string| in place by // inserting new adjustments into the middle, we would be repeatedly calling // |std::vector::insert|. That would cost O(n) time per insert, relative to // distance from end of the string. By instead allocating // |adjustments_builder| and calling |std::vector::push_back|, we only pay // amortized constant time per push. We are trading space for time. Adjustments adjustments_builder; while (adjusted_iter != adjustments_on_adjusted_string->end()) { if ((first_iter == first_adjustments.end()) || ((adjusted_iter->original_offset + shift + adjusted_iter->original_length) <= first_iter->original_offset)) { // Entire |adjusted_iter| (accounting for its shift and including its // whole original length) comes before |first_iter|. // // Correct the offset at |adjusted_iter| and move onto the next // adjustment that needs revising. adjusted_iter->original_offset += shift; shift += currently_collapsing; currently_collapsing = 0; adjustments_builder.push_back(*adjusted_iter); ++adjusted_iter; } else if ((adjusted_iter->original_offset + shift) > first_iter->original_offset) { // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). // It's not possible for the adjustments to overlap. (It shouldn't // be possible that we have an |adjusted_iter->original_offset| that, // when adjusted by the computed |shift|, is in the middle of // |first_iter|'s output's length. After all, that would mean the // current adjustment_on_adjusted_string somehow points to an offset // that was supposed to have been eliminated by the first set of // adjustments.) DCHECK_LE(first_iter->original_offset + first_iter->output_length, adjusted_iter->original_offset + shift); // Add the |first_iter| to the full set of adjustments. shift += first_iter->original_length - first_iter->output_length; adjustments_builder.push_back(*first_iter); ++first_iter; } else { // The first adjustment adjusted something that then got further adjusted // by the second set of adjustments. In other words, |first_iter| points // to something in the range covered by |adjusted_iter|'s length (after // accounting for |shift|). Precisely, // adjusted_iter->original_offset + shift // <= // first_iter->original_offset // <= // adjusted_iter->original_offset + shift + // adjusted_iter->original_length // Modify the current |adjusted_iter| to include whatever collapsing // happened in |first_iter|, then advance to the next |first_adjustments| // because we dealt with the current one. const int collapse = static_cast(first_iter->original_length) - static_cast(first_iter->output_length); // This function does not know how to deal with a string that expands and // then gets modified, only strings that collapse and then get modified. DCHECK_GT(collapse, 0); adjusted_iter->original_length += collapse; currently_collapsing += collapse; ++first_iter; } } DCHECK_EQ(0u, currently_collapsing); if (first_iter != first_adjustments.end()) { // Only first adjustments are left. These do not need to be modified. // (Their offsets are already correct with respect to the original string.) // Append them all. DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); adjustments_builder.insert(adjustments_builder.end(), first_iter, first_adjustments.end()); } *adjustments_on_adjusted_string = std::move(adjustments_builder); } // Converts the given source Unicode character type to the given destination // Unicode character type as a STL string. The given input buffer and size // determine the source, and the given output STL string will be replaced by // the result. If non-NULL, |adjustments| is set to reflect the all the // alterations to the string that are not one-character-to-one-character. // It will always be sorted by increasing offset. template bool ConvertUnicode(const SrcChar* src, size_t src_len, DestStdString* output, OffsetAdjuster::Adjustments* adjustments) { if (adjustments) adjustments->clear(); // ICU requires 32-bit numbers. bool success = true; int32_t src_len32 = static_cast(src_len); for (int32_t i = 0; i < src_len32; i++) { uint32_t code_point; size_t original_i = i; size_t chars_written = 0; if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { chars_written = WriteUnicodeCharacter(code_point, output); } else { chars_written = WriteUnicodeCharacter(0xFFFD, output); success = false; } // Only bother writing an adjustment if this modification changed the // length of this character. // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last // character read, not after it (so that incrementing it in the loop // increment will place it at the right location), so we need to account // for that in determining the amount that was read. if (adjustments && ((i - original_i + 1) != chars_written)) { adjustments->push_back(OffsetAdjuster::Adjustment( original_i, i - original_i + 1, chars_written)); } } return success; } bool UTF8ToUTF16WithAdjustments( const char* src, size_t src_len, string16* output, base::OffsetAdjuster::Adjustments* adjustments) { PrepareForUTF16Or32Output(src, src_len, output); return ConvertUnicode(src, src_len, output, adjustments); } string16 UTF8ToUTF16WithAdjustments( const base::StringPiece& utf8, base::OffsetAdjuster::Adjustments* adjustments) { string16 result; UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); return result; } string16 UTF8ToUTF16AndAdjustOffsets( const base::StringPiece& utf8, std::vector* offsets_for_adjustment) { for (size_t& offset : *offsets_for_adjustment) { if (offset > utf8.length()) offset = string16::npos; } OffsetAdjuster::Adjustments adjustments; string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); return result; } std::string UTF16ToUTF8AndAdjustOffsets( const base::StringPiece16& utf16, std::vector* offsets_for_adjustment) { for (size_t& offset : *offsets_for_adjustment) { if (offset > utf16.length()) offset = string16::npos; } std::string result; PrepareForUTF8Output(utf16.data(), utf16.length(), &result); OffsetAdjuster::Adjustments adjustments; ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); return result; } } // namespace base