회사 업무용으로 만들기는 했지만
이 정도는 거의 모든 사람들이 만들 수 있을 것 같아서 공개 결정..
다만 C++을 이렇게 하면 편하다 뭐 이런것 정도만 ^^ ㅋ
그나저나 정말 정말 싫어하는 VC++로 작성하기 버겁다..
Borland C++ Builder가 더 작업 하기 편한데 쩝..
뭐 대략 이정도...
한문, 한글, 영어, 숫자, 심볼 다섯개 스타일로 구분을 해서 결과가 나오고
숫자는 #00,0 스타일도 인식 가능하다...
테스트 프로그램 결과는 대략..
Sentence [1] : 이병헌
State : KOR
Sentence [2] : 주인공
State : KOR
Sentence [3] : 게임
State : KOR
Sentence [4] : 한국서
State : KOR
Sentence [5] : 나온다
State : KOR
Sentence [6] : [
State : SYM
이런식으로 나온다.. 흠...
한글 형태소 분석기에 쓸건데 17메가 유니코드 파일을 읽고 분류 하는데 대략 5초 정도 걸리는군..
빠른건가 느린건가.. 쩝..
이전에 솔트룩스에서 재직 시 작성한 오토마타 기반의 토크나이저
파란 블로그에 올렸었는데 파란 계정을 지우면서 날아 가서
회사 홈피에서 다시 복사해옴^^
#pragma once
#include "MemoryPool.h"
// Named code points for the control characters and ASCII punctuation that the
// tokenizer cares about. Every entry is a `const wchar_t` so that it has
// internal linkage (safe to define in a header) and is usable as a constant
// expression, e.g. as an enumerator initialiser.
const wchar_t UNICODE_NULL = 0x0000;
const wchar_t UNICODE_EOF = 0xFFFF;
const wchar_t UNICODE_HT = 0x0009; // horizontal tab
const wchar_t UNICODE_LF = 0x000A; // line feed, '\n'
const wchar_t UNICODE_CR = 0x000D; // carriage return, '\r'
const wchar_t UNICODE_SPACE = 0x0020; // space
// The `const` below had been swallowed by the preceding comment, which left
// this one constant as a mutable global.
const wchar_t UNICODE_EXCLAMATIONMARK = 0x0021; // !
const wchar_t UNICODE_QUOTATIONMARK = 0x0022; // "
const wchar_t UNICODE_NUMBERSIGN = 0x0023; // #
const wchar_t UNICODE_DOLLARSIGN = 0x0024; // $
const wchar_t UNICODE_PERCENTSIGN = 0x0025; // %
const wchar_t UNICODE_AMPERSAND = 0x0026; // &
const wchar_t UNICODE_APOSTROPHE = 0x0027; // '
const wchar_t UNICODE_LEFTPARENTHESIS = 0x0028; // (
const wchar_t UNICODE_RIGHTPARENTHESIS = 0x0029; // )
const wchar_t UNICODE_ASTERISK = 0x002A; // *
const wchar_t UNICODE_PLUS = 0x002B; // +
const wchar_t UNICODE_COMMA = 0x002C; // ,
const wchar_t UNICODE_MINUS = 0x002D; // -
const wchar_t UNICODE_DOT = 0x002E; // .
const wchar_t UNICODE_SLASH = 0x002F; // /
const wchar_t UNICODE_COLON = 0x003A; // :
const wchar_t UNICODE_SEMICOLON = 0x003B; // ;
const wchar_t UNICODE_LESS_THAN_SIGN = 0x003C; // <
const wchar_t UNICODE_EQUALS_SIGN = 0x003D; // =
const wchar_t UNICODE_GREATER_THAN_SIGN = 0x003E; // >
const wchar_t UNICODE_INTERROGATION = 0x003F; // ? (same value as UNICODE_QUESTION_MARK)
const wchar_t UNICODE_QUESTION_MARK = 0x003F; // ?
const wchar_t UNICODE_COMMERCIAL_AT = 0x0040; // @
const wchar_t UNICODE_LEFT_SQUARE_BRACKET = 0x005B; // [
const wchar_t UNICODE_REVERSE_SOLIDUS = 0x005C; // backslash (same value as UNICODE_BACK_SLASH)
const wchar_t UNICODE_BACK_SLASH = 0x005C; // backslash
const wchar_t UNICODE_RIGHT_SQUARE_BRACKET = 0x005D; // ]
const wchar_t UNICODE_CIRCUMFLEX_ACCENT = 0x005E; // ^
const wchar_t UNICODE_LOWLINE = 0x005F; // _
const wchar_t UNICODE_GRAVE_ACCENT = 0x0060; // `
const wchar_t UNICODE_LEFT_CURLY_BRACKET = 0x007B; // {
const wchar_t UNICODE_VERTICAL_LINE = 0x007C; // |
const wchar_t UNICODE_RIGHT_CURLY_BRACKET = 0x007D; // }
const wchar_t UNICODE_TILDE = 0x007E; // ~
const wchar_t UNICODE_DELETE = 0x007F; // DEL control character
// Character class assigned to every code unit by CAutoMata's lookup table.
// A token is a maximal run of characters that share one class.
enum AutomataState
{
    enSYM        = 0, // symbols; also the value of a zero-initialised table entry
    enKOR        = 1, // Hangul syllables
    enENG        = 2, // ASCII letters A-Z / a-z
    enCJK        = 3, // kana and CJK ideographs
    enNUMERIC    = 4, // ASCII digits 0-9
    enWHITESPACE = 5  // separators skipped between tokens
};
enum SYMBOLS
{
enEXCLAMATIONMARK = UNICODE_EXCLAMATIONMARK, enQUOTATIONMARK = UNICODE_QUOTATIONMARK, enNUMBERSIGN = UNICODE_NUMBERSIGN, enDOLLARSIGN = UNICODE_DOLLARSIGN, enPERCENTSIGN = UNICODE_PERCENTSIGN,
enAMPERSAND = UNICODE_AMPERSAND, enAPOSTROPHE = UNICODE_APOSTROPHE, enLEFTPARENTHESIS = UNICODE_LEFTPARENTHESIS, enRIGHTPARENTHESIS = UNICODE_RIGHTPARENTHESIS, enASTERISK = UNICODE_ASTERISK,
enPLUS = UNICODE_PLUS, enCOMMA = UNICODE_COMMA, enMINUS = UNICODE_MINUS, enDOT = UNICODE_DOT, enSLASH = UNICODE_SLASH, enCOLON = UNICODE_COLON, enSEMICOLON = UNICODE_SEMICOLON, enLESS_THAN_SIGN =
UNICODE_LESS_THAN_SIGN, enEQUALS_SIGN = UNICODE_EQUALS_SIGN, enGREATER_THAN_SIGN = UNICODE_GREATER_THAN_SIGN, enINTERROGATION = UNICODE_INTERROGATION, enQUESTION_MARK = UNICODE_QUESTION_MARK,
enCOMMERCIAL_AT = UNICODE_COMMERCIAL_AT, enLEFT_SQUARE_BRACKET = UNICODE_LEFT_SQUARE_BRACKET, enREVERSE_SOLIDUS = UNICODE_REVERSE_SOLIDUS, enBACK_SLASH = UNICODE_BACK_SLASH, enRIGHT_SQUARE_BRACKET =
UNICODE_RIGHT_SQUARE_BRACKET, enCIRCUMFLEX_ACCENT = UNICODE_CIRCUMFLEX_ACCENT, en_LOWLINE = UNICODE_LOWLINE, en_GRAVE_ACCENT = UNICODE_GRAVE_ACCENT, en_LEFT_CURLY_BRACKET =
UNICODE_LEFT_CURLY_BRACKET, en_VERTICAL_LINE = UNICODE_VERTICAL_LINE, en_RIGHT_CURLY_BRACKET = UNICODE_RIGHT_CURLY_BRACKET, en_DELETE = UNICODE_DELETE
};
// Enumerator aliases for some of the separator code points declared as
// UNICODE_* constants. Carriage return has no alias here.
enum WHITESPACE
{
    enNULL     = UNICODE_NULL,
    enEOF      = UNICODE_EOF,
    enHT       = UNICODE_HT,
    enSPACE    = UNICODE_SPACE,
    enLINEFEED = UNICODE_LF
};
// One token produced by CAutoMata: a copy of the matched text together with its
// position in the source buffer and its character class.
// Derives from CMemoryPool<TToken> (declared in MemoryPool.h, not visible here);
// presumably that supplies pooled allocation for tokens - TODO confirm.
class TToken : public CMemoryPool<TToken>
{
public:
// Offset, in wchar_t units, of the token's first character in the source buffer.
__int64 m_SPos;
// Offset at which scanning stopped, i.e. the first character after the token.
__int64 m_EPos;
// NUL-terminated copy of the token text, allocated by the tokenizer with
// new wchar_t[]. Nothing in this class releases it.
// NOTE(review): callers appear to be responsible for delete[] - confirm.
wchar_t* m_pStr;
// Character class shared by the token's characters.
AutomataState State;
};
class CAutoMata
{
private:
wchar_t* m_pRoot;
size_t m_Length;
size_t m_OffSet;
AutomataState AutoMataTable[65535];
public:
CAutoMata(wchar_t* pRoot, size_t size)
{
m_pRoot = pRoot;
m_Length = size;
m_OffSet = 0;
}
~CAutoMata()
{
}
TToken * GetNextToken(void);
void SetBuffer(wchar_t* pRoot, size_t size);
void InitTable(void);
private:
TToken* GetLiteralToken(size_t s_pos, AutomataState State);
TToken* GetNumericLiteral(size_t s_pos, AutomataState State);
};
#include "StdAfx.h"
#include "Windows.h"
#include "tokenizer.h"
// Returns the next token from the buffer, or NULL once the buffer is exhausted
// (or no buffer is set).
//
// Leading whitespace is skipped first. The skip is bounded by m_Length: NUL is
// itself classified as whitespace, so a terminator cannot stop the scan and the
// previous unbounded loop could run past the end of the buffer.
TToken* CAutoMata::GetNextToken(void)
{
    if (m_pRoot == NULL)
        return NULL;

    while (m_OffSet < m_Length && AutoMataTable[m_pRoot[m_OffSet]] == enWHITESPACE)
        ++m_OffSet;

    // Nothing but whitespace was left: report end of input without touching
    // memory beyond the buffer.
    if (m_OffSet >= m_Length)
        return NULL;

    const AutomataState State = AutoMataTable[m_pRoot[m_OffSet]];
    if (State == enNUMERIC)
        return GetNumericLiteral(m_OffSet, State);
    return GetLiteralToken(m_OffSet, State);
}
// Builds a token from the maximal run of characters classified as State that
// starts at s_pos, and advances m_OffSet to the first character after it.
//
// s_pos : index of the token's first character.
// State : class of that character; also stored in the token.
// Returns a heap-allocated token (m_pStr allocated with new[]), or NULL when
// s_pos is not inside the buffer.
//
// The scan never reads past m_Length, and a run that reaches the very end of
// the buffer is returned as a token. Previously it was discarded, which lost
// the last word of any input not followed by a different kind of character.
TToken* CAutoMata::GetLiteralToken(size_t s_pos, AutomataState State)
{
    if (m_pRoot == NULL || s_pos >= m_Length)
        return NULL;

    // The character at s_pos always belongs to the token; extend from there.
    size_t EndPos = s_pos + 1;
    while (EndPos < m_Length && AutoMataTable[m_pRoot[EndPos]] == State)
        ++EndPos;

    const size_t CharCount = EndPos - s_pos;
    TToken* Tok = new TToken();
    Tok->m_pStr = new wchar_t[CharCount + 1];
    wcsncpy(Tok->m_pStr, m_pRoot + s_pos, CharCount);
    Tok->m_pStr[CharCount] = L'\0'; // wcsncpy does not terminate a full-length copy
    Tok->m_SPos = s_pos;
    Tok->m_EPos = EndPos;
    Tok->State = State;

    m_OffSet = EndPos;
    return Tok;
}
// Allocates a token holding a NUL-terminated copy of pRoot[SPos, EPos).
// Both the token and its m_pStr are heap-allocated.
static TToken* MakeNumericToken(const wchar_t* pRoot, size_t SPos, size_t EPos, AutomataState State)
{
    const size_t CharCount = EPos - SPos;
    TToken* Tok = new TToken();
    Tok->m_pStr = new wchar_t[CharCount + 1];
    wcsncpy(Tok->m_pStr, pRoot + SPos, CharCount);
    Tok->m_pStr[CharCount] = L'\0'; // wcsncpy does not terminate a full-length copy
    Tok->m_SPos = SPos;
    Tok->m_EPos = EPos;
    Tok->State = State;
    return Tok;
}

// Builds a numeric token starting at s_pos and advances m_OffSet past it.
//
// Recognised shapes:
//   digits                  e.g. 2024
//   digits '.' digits       e.g. 3.14   (the dot needs a digit right after it)
//   digits (',' ddd)+       e.g. 1,234,567 (each comma needs exactly three
//                           digits after it to be absorbed)
// A dot or comma that does not fit is left for the next token.
//
// s_pos : index of the first character of the number.
// State : class stored in the resulting token.
// Returns a heap-allocated token, or NULL when s_pos is outside the buffer.
//
// Fixes relative to the earlier version:
//  * the decimal-point test used `=` instead of `==`, which overwrote a table
//    entry and was always true;
//  * the fractional-digit loop had its condition inverted, so "3.14" came out
//    as "3." followed by "14";
//  * all look-ahead is bounded by m_Length;
//  * a number ending exactly at the end of the buffer is returned, not dropped;
//  * m_SPos is set on every path.
TToken* CAutoMata::GetNumericLiteral(size_t s_pos, AutomataState State)
{
    if (m_pRoot == NULL || s_pos >= m_Length)
        return NULL;

    const size_t SPos = s_pos;
    size_t FPos = s_pos;

    // Integer part.
    while (FPos < m_Length && AutoMataTable[m_pRoot[FPos]] == enNUMERIC)
        ++FPos;

    if (FPos == SPos)
    {
        // Not actually positioned on a digit: emit the single character so that
        // the caller still makes progress.
        FPos = SPos + 1;
    }
    else if (FPos < m_Length && AutoMataTable[m_pRoot[FPos]] == enSYM)
    {
        switch (m_pRoot[FPos])
        {
        case enDOT:
            // Fraction: only when a digit immediately follows the dot.
            if (FPos + 1 < m_Length && AutoMataTable[m_pRoot[FPos + 1]] == enNUMERIC)
            {
                ++FPos; // step over the dot
                while (FPos < m_Length && AutoMataTable[m_pRoot[FPos]] == enNUMERIC)
                    ++FPos;
            }
            break;
        case enCOMMA:
            // Thousands groups: absorb every ",ddd" that fits in the buffer.
            while (FPos + 3 < m_Length
                && m_pRoot[FPos] == enCOMMA
                && AutoMataTable[m_pRoot[FPos + 1]] == enNUMERIC
                && AutoMataTable[m_pRoot[FPos + 2]] == enNUMERIC
                && AutoMataTable[m_pRoot[FPos + 3]] == enNUMERIC)
            {
                FPos += 4;
            }
            break;
        default:
            break;
        }
    }

    m_OffSet = FPos;
    return MakeNumericToken(m_pRoot, SPos, FPos, State);
}
void CAutoMata::InitTable(void)
{
wchar_t c;
for (c = 0xAC00; c <= 0xD7A3; c++)
AutoMataTable[c] = enKOR;
for (c = L'A'; c <= L'Z'; c++)
AutoMataTable[c] = enENG;
for (c = L'a'; c <= L'z'; c++)
AutoMataTable[c] = enENG;
for (c = L'0'; c <= L'9'; c++)
AutoMataTable[c] = enNUMERIC;
for (c = 0x3040; c <= 0x30FF; c++)
AutoMataTable[c] = enCJK;
for (c = 0x3400; c <= 0x4DB5; c++)
AutoMataTable[c] = enCJK;
for (c = 0x4E00; c <= 0x9FA5; c++)
AutoMataTable[c] = enCJK;
for (c = 0xF900; c <= 0xFA2D; c++)
AutoMataTable[c] = enCJK;
AutoMataTable[UNICODE_NULL] = enWHITESPACE;
AutoMataTable[UNICODE_EOF] = enWHITESPACE;
AutoMataTable[UNICODE_HT] = enWHITESPACE;
AutoMataTable[enSPACE] = enWHITESPACE;
AutoMataTable[enLINEFEED] = enWHITESPACE;
AutoMataTable[UNICODE_LF] = enWHITESPACE;
AutoMataTable[UNICODE_CR] = enWHITESPACE;
}
void CAutoMata::SetBuffer(wchar_t* pRoot, size_t size)
{
m_pRoot = pRoot;
m_Length = size;
}



![양파 - Elegy Nouveau [Mini Album]](http://image.aladin.co.kr/coveretc/music/coveroff/4775039288_2.jpg)








최근 덧글