改编自:http://www.zedwood.com/article/cpp-utf-8-mb_substr-function
这个也有问题,我基于这个改了个完善的,另外写了个文字个数统计
/// Counts the UTF-8 encoded characters in `str`.
///
/// The string is walked one sequence at a time: the high bits of each lead
/// byte give the sequence width (1 to 4 bytes) and the following continuation
/// bytes are skipped without being inspected.
///
/// @param str Byte string expected to contain UTF-8 text.
/// @return The number of characters found; 0 for an empty string or as soon as
///         a byte in lead position is not a valid 1- to 4-byte lead.
uint64_t pinyin::utf8_len(const std::string &str) {
    const uint64_t total_bytes = str.length();
    uint64_t chars = 0;
    uint64_t pos = 0;

    while (pos < total_bytes) {
        const uint64_t lead = static_cast<unsigned char>(str[pos]);
        uint64_t width = 0;

        if (lead <= 0x7F) {
            width = 1;  // 0bbbbbbb: single-byte (ASCII)
        } else if ((lead & 0xE0) == 0xC0) {
            width = 2;  // 110bbbbb
        } else if ((lead & 0xF0) == 0xE0) {
            width = 3;  // 1110bbbb
        } else if ((lead & 0xF8) == 0xF0) {
            width = 4;  // 11110bbb
        } else {
            // Continuation byte in lead position, or a 5-/6-byte lead, which
            // 4-byte UTF-8 does not use: treated as invalid input.
            return 0;
        }

        ++chars;
        // May step past the end on a truncated trailing sequence; the loop
        // condition then simply ends the walk.
        pos += width;
    }

    return chars;
}
/// Extracts a substring of `str` measured in UTF-8 characters rather than bytes.
///
/// The whole string is scanned one sequence at a time; the high bits of each
/// lead byte give the sequence width (1 to 4 bytes). Continuation bytes are
/// skipped without being inspected.
///
/// @param str   Byte string expected to contain UTF-8 text.
/// @param start Zero-based index of the first character to include.
/// @param leng  Maximum number of characters to include. Passing
///              `std::string::npos` (or any value so large that
///              `start + leng` would overflow) means "up to the end".
/// @return The selected characters as a byte string. An empty string is
///         returned when `leng` is 0, when `start` is at or beyond the number
///         of characters in `str`, or when any byte in lead position anywhere
///         in `str` is not a valid 1- to 4-byte lead.
std::string pinyin::utf8_substr(const std::string &str, uint64_t start, uint64_t leng) {
    if (leng == 0) {
        return "";
    }

    // "To the end" is requested explicitly with npos; a length that would make
    // start + leng wrap around is handled the same way instead of relying on
    // unsigned overflow.
    const bool to_end = (leng == std::string::npos) ||
                        (leng > ~static_cast<uint64_t>(0) - start);
    // Character index one past the last requested character (unused if to_end).
    const uint64_t end_char = to_end ? 0 : start + leng;

    const uint64_t total_bytes = str.length();
    bool found_start = false;
    uint64_t first_byte = 0;  // byte offset of character `start`
    uint64_t last_byte = 0;   // byte offset one past the last selected character
    uint64_t char_index = 0;
    uint64_t pos = 0;

    while (pos < total_bytes) {
        if (char_index == start) {
            first_byte = pos;
            found_start = true;
        }
        // Keep moving the end marker until the character index passes end_char.
        if (to_end || char_index <= end_char) {
            last_byte = pos;
        }

        const uint64_t lead = static_cast<unsigned char>(str[pos]);
        uint64_t width = 0;
        if (lead <= 0x7F) {
            width = 1;  // 0bbbbbbb: single-byte (ASCII)
        } else if ((lead & 0xE0) == 0xC0) {
            width = 2;  // 110bbbbb
        } else if ((lead & 0xF0) == 0xE0) {
            width = 3;  // 1110bbbb
        } else if ((lead & 0xF8) == 0xF0) {
            width = 4;  // 11110bbb
        } else {
            // Continuation byte in lead position, or a 5-/6-byte lead, which
            // 4-byte UTF-8 does not use: treated as invalid input.
            return "";
        }

        pos += width;
        ++char_index;
    }

    // The requested range reaches (or passes) the end of the string.
    if (to_end || char_index <= end_char) {
        last_byte = pos;
    }

    if (!found_start) {
        return "";
    }

    // `pos` may exceed the string size after a truncated trailing sequence;
    // std::string::substr clamps the count to the available bytes.
    return str.substr(first_byte, last_byte - first_byte);
}