在我制作工具的多个情景下都会产生某些名字无法使用的问题,例如虚幻的ABC插件不支持中文名,蓝图的变量经常存在空格与特殊字符无法绑定到其他语言等。
为了解决这个问题,我做了一个可以将名字字符限定在a-z A-Z 0-9 _之内的编解码方案。
对中文、空格、特殊字符等不在上述范围内的字符进行转义;编程语言不支持以数字开头的标识符,所以也会对开头的数字进行转义。
一个非数字开头,只含有大小写字母、数字以及下划线(且不包含转义符_h)的名字将不会被编码所改变。如 Name_A016。
该编码规范将使用_h作为转义符开头,_结尾,中间的内容是转义结果,虽然一个下划线正常情况下不会被编码,但这里用下划线举例,_编码结果为_h5F_,其中_h为转义符,5F为下划线的ASCII编码(十六进制),最后的_代表转义结束。
如果原字符串中出现了用于转义的_h,那么这两个字符会被整体转义为_h5F_:其中5F为下划线的ASCII编码,紧随其后的h在编码时被略过;解码时遇到_h5F_会还原为_h。
如数字开头的字符串,则会在开头添加_h01_,如3DWidget将会被编码为_h01_3DWidget。
转义符用下划线来代表结束也是为了支持中文字符串,因为字符串的长度是不确定的。
如“技能3”,其中的“技能”将会被编码为_hu8062FD80_,在这个编码里_h和往常一样作为转义符出现,而接下来出现的u代表着unicode编码(小写u表示字节按小端序写入,大写U表示大端序),8062 FD80则是技(U+6280)、能(U+80FD)二字的UTF-16编码按小端字节序排列的结果,所以编码结果为_hu8062FD80_3
这里使用UE的代码来实现。
#include "NameEncode.h"
#include <ctype.h>
// An escape sequence is written as ESCAPE_SYMBOL + payload + ESCAPE_END_SYMBOL.
// First character of the escape introducer.
#define ESCAPE_SYMBOL_FIRST TEXT('_')
// Second character of the escape introducer.
#define ESCAPE_SYMBOL_SECOND TEXT('h')
// The complete escape introducer as a string literal.
#define ESCAPE_SYMBOL TEXT("_h")
// Number of characters in ESCAPE_SYMBOL.
#define ESCAPE_SYMBOL_LEN 2
// Character that closes an escape sequence.
#define ESCAPE_END_SYMBOL TEXT('_')
// Reserved code used as the "name starts with a digit" marker; Encode rejects names containing this character.
#define ESCAPE_HEAD_NUMBER 0x01
// Two-digit hex text of ESCAPE_HEAD_NUMBER as it is written into the encoded name.
#define ESCAPE_HEAD_NUMBER_SYMBOL TEXT("01")
// Payload prefix of a wide-character run whose bytes were written in little-endian order.
#define ESCAPE_UNICODE_IDENT_LE TEXT('u')
// Payload prefix of a wide-character run whose bytes were written in big-endian order.
#define ESCAPE_UNICODE_IDENT_BE TEXT('U')
// Payload prefix matching the byte order of the machine that runs the encoder.
#define ESCAPE_UNICODE_IDENT (IsLittleEndian() ? ESCAPE_UNICODE_IDENT_LE : ESCAPE_UNICODE_IDENT_BE)
/**
 * Tells whether a character has to be written as a single-character hex escape.
 *
 * @param C Character code to classify.
 * @return true for 7-bit ASCII codes (0x00-0x7F) that are not a letter, a digit or '_';
 *         false for letters, digits, '_', and for every code outside the 7-bit ASCII range.
 *
 * The classification uses explicit ASCII ranges instead of ::isalnum so that it does not depend
 * on the active C locale and stays well defined for negative arguments.
 */
static inline bool ShouldEscape(int C)
{
    // Codes outside 7-bit ASCII are never handled by the single-character escape.
    if (C < 0 || C > 0x7F)
    {
        return false;
    }
    const bool bDigit = C >= '0' && C <= '9';
    const bool bUpper = C >= 'A' && C <= 'Z';
    const bool bLower = C >= 'a' && C <= 'z';
    return !(bDigit || bUpper || bLower || C == '_');
}
// Reports whether the host stores the low-order byte of a multi-byte integer first.
// The probe runs once, on the first call; later calls return the cached answer.
static inline bool IsLittleEndian()
{
    static const bool bLowByteFirst = []
    {
        unsigned short Probe = 1;
        // On a little-endian host the first byte in memory holds the value 1.
        return reinterpret_cast<unsigned char*>(&Probe)[0] != 0;
    }();
    return bLowByteFirst;
}
// Upper-case hexadecimal digits, indexed by nibble value.
static const char* HexList = "0123456789ABCDEF";
// Writes Value as two upper-case hex digits: Out[0] receives the high nibble, Out[1] the low nibble.
// Out must have room for two characters; no terminator is written.
static inline void ByteToHex(uint8_t Value, TCHAR* Out)
{
    const uint8_t HighNibble = Value >> 4;
    const uint8_t LowNibble = Value & 0x0F;
    Out[0] = HexList[HighNibble];
    Out[1] = HexList[LowNibble];
}
/**
 * Converts one hexadecimal digit to its numeric value.
 *
 * @param Str The digit character; '0'-'9', 'A'-'F' and 'a'-'f' are accepted.
 * @return The value 0-15, or 0 when Str is not a hexadecimal digit.
 */
static uint8 HexToByte(TCHAR Str)
{
    if (Str >= '0' && Str <= '9')
    {
        return static_cast<uint8>(Str - '0');
    }
    if (Str >= 'A' && Str <= 'F')
    {
        return static_cast<uint8>(Str - 'A' + 10);
    }
    // Lower-case digits are accepted as well so that a case-altered name still decodes.
    if (Str >= 'a' && Str <= 'f')
    {
        return static_cast<uint8>(Str - 'a' + 10);
    }
    // Not a hex digit: return a defined value instead of a wrapped-around difference.
    return 0;
}
// Combines the two hex digits at Str[0] (high nibble) and Str[1] (low nibble) into one byte.
// Both characters are read unconditionally, so Str must point at two readable characters.
static uint8 HexToByte(const TCHAR* Str)
{
    const uint8 High = HexToByte(Str[0]);
    const uint8 Low = HexToByte(Str[1]);
    return High * 16 + Low;
}
/**
 * Encodes Name so that escaped content is expressed with the characters a-z A-Z 0-9 _ only.
 *
 * Rules applied per character:
 *  - a leading digit is kept but prefixed with the marker "_h01_";
 *  - the pair "_h" is written as one escape of '_' ("_h5F_") and the 'h' is consumed;
 *  - any other 7-bit ASCII character that is not a letter, digit or '_' becomes "_hXX_",
 *    XX being its two-digit upper-case hex code;
 *  - when bEncodeWideString is true, a run of characters above 0x7F becomes
 *    "_h" + 'u'/'U' + four hex digits per character + "_", the bytes being written in host
 *    memory order and 'u'/'U' recording whether that order is little- or big-endian;
 *  - everything else is copied unchanged (including characters above 0x7F when
 *    bEncodeWideString is false).
 *
 * @param Name              The name to encode.
 * @param bEncodeWideString Whether characters above 0x7F are escaped.
 * @return The encoded name, or an empty string (after logging an error) when Name contains
 *         the reserved character 0x01.
 */
FString FNameEncode::Encode(const FString& Name, bool bEncodeWideString)
{
    // What the previous character produced; only "was it part of a wide run" is ever queried.
    constexpr int kNormal = 1;
    constexpr int kEscapeChar = 2;
    constexpr int kUnicodeString = 3;

    FString Ret;
    Ret.Reserve(Name.Len() + 4);
    const int LastIndex = Name.Len() - 1;
    int LastState = kNormal;
    for (int i = 0; i < Name.Len(); ++i)
    {
        const auto C = Name[i];
        if (C == ESCAPE_HEAD_NUMBER)
        {
            // 0x01 is reserved for the leading-digit marker and cannot appear in the input.
            UE_LOG(LogTemp, Error, TEXT("string contains illegal characters: %s"), *Name);
            return {};
        }

        const bool bEscapeSymbol =
            C == ESCAPE_SYMBOL_FIRST && i != LastIndex && Name[i + 1] == ESCAPE_SYMBOL_SECOND;
        // Plain range comparison on purpose: ::isdigit is undefined for values that do not fit
        // in unsigned char, and a TCHAR such as a CJK character exceeds that range.
        const bool bDigitHead = i == 0 && C >= TEXT('0') && C <= TEXT('9');

        if (bDigitHead)
        {
            // Marker first, then the digit itself unchanged.
            Ret.AppendChars(ESCAPE_SYMBOL, ESCAPE_SYMBOL_LEN);
            Ret.AppendChars(ESCAPE_HEAD_NUMBER_SYMBOL, 2);
            Ret.AppendChar(ESCAPE_END_SYMBOL);
            Ret.AppendChar(C);
            LastState = kEscapeChar;
        }
        else if (bEscapeSymbol || ShouldEscape(C))
        {
            Ret.AppendChars(ESCAPE_SYMBOL, ESCAPE_SYMBOL_LEN);
            TCHAR Hex[2];
            ByteToHex(C, Hex);
            Ret.AppendChars(Hex, 2);
            if (bEscapeSymbol)
            {
                // The 'h' of a literal "_h" is represented by the escaped '_' alone.
                ++i;
            }
            Ret.AppendChar(ESCAPE_END_SYMBOL);
            LastState = kEscapeChar;
        }
        else if (bEncodeWideString && C > 0x7F)
        {
            if (LastState != kUnicodeString)
            {
                // Open a new wide run and record the byte order used for its payload.
                Ret.AppendChars(ESCAPE_SYMBOL, ESCAPE_SYMBOL_LEN);
                Ret.AppendChar(ESCAPE_UNICODE_IDENT);
            }
            unsigned short CodeUnit = static_cast<unsigned short>(C);
            const unsigned char* Bytes = reinterpret_cast<const unsigned char*>(&CodeUnit);
            TCHAR Hex[4];
            ByteToHex(Bytes[0], Hex);
            ByteToHex(Bytes[1], Hex + 2);
            Ret.AppendChars(Hex, 4);

            // The run stays open while the next character is also above 0x7F.
            const bool bRunContinues = i != LastIndex && Name[i + 1] > 0x7F;
            if (!bRunContinues)
            {
                Ret.AppendChar(ESCAPE_END_SYMBOL);
            }
            LastState = kUnicodeString;
        }
        else
        {
            // a-z A-Z 0-9 _ (or an unescaped wide character when bEncodeWideString is false)
            Ret.AppendChar(C);
            LastState = kNormal;
        }
    }
    return Ret;
}
/**
 * Reverses FNameEncode::Encode.
 *
 * Recognised sequences:
 *  - "_hXX_"        : one character with hex code XX; "_h01_" at the very start of the name is
 *                     the leading-digit marker and produces nothing; "_h5F_" produces "_h";
 *  - "_hu...._" / "_hU...._" : a run of 16-bit characters, four hex digits each, whose two bytes
 *                     are in little-endian ('u') or big-endian ('U') order.
 * All other characters are copied unchanged.
 *
 * @param Name The encoded name.
 * @return The decoded name, or an empty string (after logging an error) when an escape
 *         sequence is cut off before its hex digits are complete.
 */
FString FNameEncode::Decode(const FString& Name)
{
    constexpr int kNormal = 1;
    constexpr int kEscapeChar = 2;
    constexpr int kUnicodeString = 3;

    FString Ret;
    Ret.Reserve(Name.Len());
    const int LastIndex = Name.Len() - 1;
    int State = kNormal;
    // Wide-run context: payload byte order and the first byte of a half-read character.
    bool bRunIsLittleEndian = true;
    bool bHasPendingByte = false;
    unsigned char PendingByte{};
    for (int i = 0; i < Name.Len(); ++i)
    {
        const auto C = Name[i];
        switch (State)
        {
        case kNormal:
        {
            // "_h" only starts an escape when at least one more character follows it.
            const bool bEscape =
                C == ESCAPE_SYMBOL_FIRST &&
                i < LastIndex - 1 &&
                Name[i + 1] == ESCAPE_SYMBOL_SECOND;
            if (bEscape)
            {
                State = kEscapeChar;
                i += 1; // skip escape symbol
                break;
            }
            Ret.AppendChar(C);
        }
        break;
        case kEscapeChar:
        {
            if (C == ESCAPE_UNICODE_IDENT_LE || C == ESCAPE_UNICODE_IDENT_BE)
            {
                bRunIsLittleEndian = C == ESCAPE_UNICODE_IDENT_LE;
                State = kUnicodeString;
                break;
            }
            if (i + 1 > LastIndex)
            {
                // Only one character left: the two-digit hex code is incomplete.
                UE_LOG(LogTemp, Error, TEXT("decode failed. %s"), *Name);
                return {};
            }
            const auto Hex = HexToByte(GetData(Name) + i);
            if (i == 2 /* hex start offset */ && Hex == ESCAPE_HEAD_NUMBER)
            {
                // Leading-digit marker: carries no character of its own.
            }
            else if (Hex == '_')
            {
                // An escaped '_' stands for the literal pair "_h".
                Ret.AppendChars(ESCAPE_SYMBOL, ESCAPE_SYMBOL_LEN);
            }
            else
            {
                Ret.AppendChar(Hex);
            }
            i += 2; // skip second hex digit and end symbol
            State = kNormal;
        }
        break;
        case kUnicodeString:
        {
            if (i + 1 >= Name.Len())
            {
                UE_LOG(LogTemp, Error, TEXT("decode failed. %s"), *Name);
                return {};
            }
            const unsigned char Byte = HexToByte(GetData(Name) + i);
            ++i; // the second hex digit of this byte has been consumed
            if (!bHasPendingByte)
            {
                PendingByte = Byte;
                bHasPendingByte = true;
                break;
            }
            bHasPendingByte = false;
            // Build the value from the recorded payload byte order, so the result does not
            // depend on the byte order of the decoding machine.
            const unsigned short CodeUnit = bRunIsLittleEndian
                ? static_cast<unsigned short>(PendingByte | (Byte << 8))
                : static_cast<unsigned short>((PendingByte << 8) | Byte);
            Ret.AppendChar(static_cast<TCHAR>(CodeUnit));
            // The run ends at the end symbol or at the end of the input.
            if (i == LastIndex || Name[i + 1] == ESCAPE_END_SYMBOL)
            {
                ++i; // skip end symbol
                State = kNormal;
            }
        }
        break;
        default: ;
        }
    }
    return Ret;
}