/* * * Copyright (c) 2023 Project CHIP Authors * All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utf8.h" namespace chip { namespace Utf8 { namespace { /** State machine for UTF8 valid bytes Table 3-7. Well-Formed UTF-8 Byte Sequences Code Points | First B | Second B | Third B | Fourth B ------------------+----------+------------+---------+--------- U+0000..U+007F | 00..7F | | | U+0080..U+07FF | C2..DF | 80..BF | | U+0800..U+0FFF | E0 | A0..BF (A) | 80..BF | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | U+D000..U+D7FF | ED | 80..9F (B) | 80..BF | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | U+10000..U+3FFFF | F0 | 90..BF (C) | 80..BF | 80..BF U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF U+100000..U+10FFFF| F4 | 80..8F (D) | 80..BF | 80..BF */ enum class ParserState { kFirstByte, kSecondByte_A, kSecondByte_B, kSecondByte_C, kSecondByte_D, kExtraOneByte, // 0x80 .. 0xBF once kExtraTwoBytes, // 0x80 .. 0xBF twice kExtraThreeBytes, // 0x80 .. 0xBF three times // kInvalid, // some error }; ParserState NextState(ParserState state, uint8_t value) { switch (state) { case ParserState::kFirstByte: if (value <= 0x7F) { return ParserState::kFirstByte; } else if ((value >= 0xC2) && (value <= 0xDF)) { return ParserState::kExtraOneByte; } else if (value == 0xE0) { return ParserState::kSecondByte_A; } else if ((value >= 0xE1) && (value <= 0xEC)) { return ParserState::kExtraTwoBytes; } else if (value == 0xED) { return ParserState::kSecondByte_B; } else if ((value >= 0xEE) && (value <= 0xEF)) { return ParserState::kExtraTwoBytes; } else if (value == 0xF0) { return ParserState::kSecondByte_C; } else if ((value >= 0xF1) && (value <= 0xF3)) { return ParserState::kExtraThreeBytes; } else if (value == 0xF4) { return ParserState::kSecondByte_D; } else { return ParserState::kInvalid; } case ParserState::kSecondByte_A: if (value >= 0xA0 && value <= 0xBF) { return ParserState::kExtraOneByte; } else { return ParserState::kInvalid; } case ParserState::kSecondByte_B: if (value >= 0x80 && value <= 0x9F) { return ParserState::kExtraOneByte; } else { return ParserState::kInvalid; } case ParserState::kSecondByte_C: if (value >= 0x90 && value <= 0xBF) { return ParserState::kExtraTwoBytes; } else { return ParserState::kInvalid; } case ParserState::kSecondByte_D: if (value >= 0x80 && value <= 0x8F) { return ParserState::kExtraTwoBytes; } else { return ParserState::kInvalid; } case ParserState::kExtraOneByte: if (value >= 0x80 && value <= 0xBF) { return ParserState::kFirstByte; } else { return ParserState::kInvalid; } case ParserState::kExtraTwoBytes: if (value >= 0x80 && value <= 0xBF) { return ParserState::kExtraOneByte; } else { return ParserState::kInvalid; } case ParserState::kExtraThreeBytes: if (value >= 0x80 && value <= 0xBF) { return ParserState::kExtraTwoBytes; } else { return ParserState::kInvalid; } default: return ParserState::kInvalid; } } } // namespace bool IsValid(CharSpan span) { ParserState state = ParserState::kFirstByte; const char * data = span.data(); const size_t kLength = span.size(); // Every byte should be valid for (size_t i = 0; i < kLength; i++) { state = NextState(state, static_cast(data[i])); if (state == ParserState::kInvalid) { return false; } } // finally no continuation should be expected return state == ParserState::kFirstByte; } } // namespace Utf8 } // namespace chip