/*-----------------------Own_lib函数库系列 utf8towcs.cc------------------------
cy_utf8towcs() /utf8towcs.o /libcyfunc.a

描述: 将国际通行用于磁盘存储的8bits字节序列---UTF-8编码的字符串转换为wchar_t宽字
      节字符串不需要查找码表, 它通过简单的码位截取与合成即可完成.
      本函数提供这一实现.
dest_wstr:
      字符串转换到的宽字节序列目标地址.
src_str:
      被转换的源字符串的指针.
max_chars:
      将被转换的最多字符个数限制, 若置之为0, 则转换直到字符串结束符'\0'.
返回值:
      返回实际转换的字符数. 若遇到错误或检测到非法字节序列, 则返回(size_t)-1.

注意! 1. 传递的字符串必须是合法的UTF-8编码序列.
      2. 除0外, 如果返回值等于max_chars, 则转换后的宽字符串不是以L'\0'结尾的.

作者: 任逍 |2002.05.26.
版权: GNU General (Library) Public License (GPL/LGPL)

* 编辑器: vim-6.0 |操作系统: TurboLinux7.0简体中文版 *
------------------------------------------------------------------------------*/

#include "cyinclude/cyutf.h"
#define MAX_CONV_CHARS	65535	// cap applied when max_chars == 0 (largest 16-bit unsigned value)

/*
 * cy_utf8towcs - decode a NUL-terminated UTF-8 byte string into wide characters.
 *
 * dest_wstr: destination buffer for the decoded wide characters.
 * src_str:   NUL-terminated UTF-8 input.
 * max_chars: maximum number of wide characters to store in dest_wstr;
 *            0 selects the built-in cap MAX_CONV_CHARS.
 *
 * Returns the number of wide characters stored, not counting the terminator,
 * or (size_t)-1 if either pointer is NULL or the input contains an invalid
 * lead byte, an invalid continuation byte, or a sequence cut short by the
 * terminating NUL.  On error, characters decoded before the bad sequence
 * remain in dest_wstr.
 *
 * A terminating L'\0' is appended only when the whole input was decoded in
 * fewer than max_chars characters; if the return value equals the effective
 * max_chars the output is NOT terminated.  dest_wstr must therefore have room
 * for min(effective max_chars, decoded length + 1) elements.
 *
 * The decoder accepts the historical 1..6 byte forms (lead bytes up to 0xfd)
 * and does not reject overlong encodings or surrogate code points.
 */
size_t  cy_utf8towcs(wchar_t * dest_wstr,
                     const unsigned char * src_str,
                     size_t max_chars)
{
	const unsigned char cont_mask = 0x3f;	// payload bits of a 10xxxxxx byte
	size_t	src_pos = 0;	// index of the next unread source byte
	size_t	out_chars = 0;	// wide characters written so far

	if ( (!src_str) || (!dest_wstr) )
		return (size_t)-1;

	if (max_chars == 0)
		max_chars = MAX_CONV_CHARS;

	// Test the current byte (not the one after it) so that the terminator
	// is never decoded as a character and nothing beyond it is ever read.
	while ( (out_chars < max_chars) && (src_str[src_pos] != '\0') )
	{
		const unsigned char lead = src_str[src_pos];
		unsigned char lead_mask;	// payload bits of the lead byte
		size_t seq_len;			// total bytes in this sequence
		size_t i;
		unsigned long code_point;	// unsigned: shifts are well defined

		if (lead <= 0x7f) {				// 0xxxxxxx
			seq_len = 1;
			lead_mask = 0x7f;
		} else if ( (lead >= 0xc0) && (lead <= 0xdf) ) {	// 110xxxxx
			seq_len = 2;
			lead_mask = 0x1f;
		} else if ( (lead >= 0xe0) && (lead <= 0xef) ) {	// 1110xxxx
			seq_len = 3;
			lead_mask = 0x0f;
		} else if ( (lead >= 0xf0) && (lead <= 0xf7) ) {	// 11110xxx
			seq_len = 4;
			lead_mask = 0x07;
		} else if ( (lead >= 0xf8) && (lead <= 0xfb) ) {	// 111110xx
			seq_len = 5;
			lead_mask = 0x03;
		} else if ( (lead >= 0xfc) && (lead <= 0xfd) ) {	// 1111110x
			seq_len = 6;
			lead_mask = 0x01;
		} else {
			// 0x80..0xbf (stray continuation byte) or 0xfe/0xff
			return (size_t)-1;
		}

		code_point = (unsigned long)(lead & lead_mask);
		for (i = 1; i < seq_len; i++)
		{
			const unsigned char next = src_str[src_pos + i];

			// Every trailing byte must look like 10xxxxxx.  This also
			// stops at a premature NUL, because bytes are checked in
			// order and the scan returns at the first mismatch.
			if ( (next & 0xc0) != 0x80 )
				return (size_t)-1;

			code_point = (code_point << 6) | (unsigned long)(next & cont_mask);
		}

		dest_wstr[out_chars] = (wchar_t)code_point;
		out_chars++;
		src_pos += seq_len;
	}

	// Terminate the output when there is still room within the limit.
	if (out_chars < max_chars)
		dest_wstr[out_chars] = L'\0';

	return out_chars;
}