Code.hppと.cpp

ここでは、2つのソース（.hppと.cpp）のみ記載します。

ソースの説明はこちらを参照ください。

※昔に書いたソースの修正版です。（クラス名とクラス構成を変更しました）

※注意
下の方に2ファイル目もありますので、忘れずにコピーしてください。

Code.hppファイル


//Code.hpp
//
#ifndef __CODE_H__
#define __CODE_H__
 
 
#include <vector>
#include <stdexcept>
#include <memory>
#include "MCharIterator.hpp"
 
 
namespace nana{
  
 
using namespace std;
using namespace boost;
 
 
 
 
//===========文字コードを扱うクラス=======================
 
 
 
/**
Character set that computes the byte length of a single Shift_JIS character.
*/
class SjisCharSet : public CharSet
{
public:
  /// Default constructor; the class declares no data members.
  SjisCharSet() {}

  /// Computes the byte length of the next character starting at p and
  /// reports its kind through type.
  const int calc(const char* p, CharSet::Type& type) const;
};
 
 
 
/**
Character set that computes the byte length of a single EUC character.
*/
class EucCharSet : public CharSet
{
public:
  /// Default constructor; the class declares no data members.
  EucCharSet() {}

  /// Computes the byte length of the next character starting at p and
  /// reports its kind through type.
  const int calc(const char* p, CharSet::Type& type) const;
};
 
 
/**
Character set that computes the byte length of a single UTF-8 character.
*/
class Utf8CharSet : public CharSet
{
public:
  /// Default constructor; the class declares no data members.
  Utf8CharSet() {}

  /// Computes the byte length of the next character starting at p and
  /// reports its kind through type.
  const int calc(const char* p, CharSet::Type& type) const;
};
 
 
 
/**
<pre>
Character set that computes the byte length of a single JIS character.
JIS text contains escape (control) sequences.
The length reported by this class includes those escape sequences.
Therefore, when an escape sequence switches the character kind, the first
character after the switch is reported as
"3 bytes of escape sequence + the bytes of the actual character".
</pre>
*/
class JisCharSet : public CharSet
{
public:
  /// Default constructor; the class declares no data members.
  JisCharSet() {}

  /// Computes the byte length of the next character starting at p and
  /// reports its kind through type.
  const int calc(const char* p, CharSet::Type& type) const;

  /// Compares two characters (JIS needs special handling).
  const bool equals(const MChar& m1, const MChar& m2) const;
};
 
  
 
 
//=======文字コード変換クラス：おまけ========
 
/**
Fixed-length character set.
Every character is reported with the same byte length, given at construction.
Used for things such as single-character comparison.
*/
class FixedCharSet : public CharSet
{
public:
  /// @param len Byte length that calc() reports for every character.
  explicit FixedCharSet(const int& len) : m_len(len) {}

  /// Reports the fixed length; p is not inspected and type is always set
  /// to NONE_KIND.
  const int calc(const char* p, CharSet::Type& type) const
  {
    type = NONE_KIND;
    return m_len;
  }

private:
  int m_len;  ///< Byte length returned by calc().
};
 
 

/**
<pre>
Fixed character: an adapter between MChar and a character held by value.
Notes:
  This class owns a copy of the character's bytes.
  The original author notes that MChar is only a reference to a position in
  the text, so assigning a FixedMChar to an MChar is not allowed.
  The opposite direction is fine (fixedChar = mchar; works).
Example:
  const char* st = "abcあdいe漏fg";
  // destination string stream
  ostringstream oss;
  ostream_iterator<FixedMChar> oite(oss);
  CharMCharIterator i = mcharIterator(st, getCharSet("sjis"));
  CharMCharIterator end;

  // STL algorithm
  replace_copy(i, end, oite, FixedMChar("あ"), FixedMChar("#"));

  cout << oss.str() << ":replace" << endl << endl;
</pre>
*/
class FixedMChar {
public:
  /// Copies a NUL-terminated C string.
  FixedMChar(const char* str) : m_str(str) {}

  /// Copies a string.
  FixedMChar(const string& str) : m_str(str) {}

  /// Copies mchar.size() bytes starting at mchar.pos().
  FixedMChar(const MChar& mchar) : m_str(mchar.pos(), mchar.size()) {}

  /// True when both hold the same bytes.
  const bool operator ==(const FixedMChar& fm) const {
    return m_str == fm.m_str;
  }

  /// True when m has the same size and strncmp finds no difference
  /// over that many bytes.
  const bool operator ==(const MChar& m) const {
    return m_str.size() == m.size()
        && strncmp(m_str.data(), m.pos(), m_str.size()) == 0;
  }

  /// Writes the stored bytes to out.
  friend ostream& operator <<(ostream& out, const FixedMChar& fm) {
    out << fm.m_str;
    return out;
  }

  /// Mirror of the member comparison, with MChar on the left-hand side.
  friend const bool operator ==(const MChar& m, const FixedMChar& fm) {
    return (fm == m);
  }

private:
  string m_str;  ///< Owned copy of the character's bytes.
};
 
 
 
 
//-------------------------------------
 
/**
文字列を検索する<br>
@param begin [in]対象文字列のイテレータ。 
@param size [in]対象文字列のサイズ（バイト数）。
@param src [in]検索する文字列。
@return 見つかった位置i。target[i]。見つからなかったとき-1。 
*/
template <typename Iterator>
const int searchStr(const MCharIterator<Iterator>& begin, const int& size, const string& src)
{
	string::size_type n = src.size();
	const char* bp = begin->pos();
	const char* ep = bp + size - n;
 	
	//
	MCharIterator<Iterator> f, end;
	f = begin;
	for(; f != end; ++f){
		//終端まできたら終了
		if(f->pos() > ep) break;
		//文字列比較
		if(strncmp(f->pos(), src.data(), n)==0) return (f->pos() - bp);
	}
 	
	return -1;
};
 
 
 
/**
Returns a pointer to the character-set object for an encoding name.
@param code [in] One of "sjis", "euc", "utf8", "jis" (lowercase only; matched exactly). The definition in Code.cpp also accepts "default". Any other value raises an exception.
@return The object corresponding to code. The caller must not delete it.
@exception invalid_argument When code is not a recognized name.
*/
const CharSet* getCharSet(const string& code) throw(invalid_argument);
 
 
/**
Function object that converts Shift_JIS characters to EUC and writes the
result to an output stream.
*/
class Sjis2Euc
{
public:
  /// @param os Stream that receives the converted bytes. Only its address
  ///           is stored, so it must outlive this object.
  Sjis2Euc(ostream& os) : m_os(&os) {}

  /// Converts the single character src and writes it to the stream.
  void operator () (const MChar& src);

private:
  ostream* m_os;  ///< Destination stream (not owned).
};
 
 
}; //namespace nana
 
#endif

Code.cppファイル


#include <boost/scoped_array.hpp>
#include "code.hpp"
 
 
namespace nana{
 
using namespace std;
using namespace boost;
 
 
// Character-set instances, one per supported encoding (file-local).
static const SjisCharSet g_sjis;
static const EucCharSet g_euc;
static const Utf8CharSet g_utf8;
static const JisCharSet g_jis;
// Set the default character set: the Shift_JIS instance.
const CharSet* CharSetDefault::defaultCharSet = &g_sjis;
 
 
// CharSet factory. The returned object must not be deleted by the caller.
// "default" yields CharSetDefault::get(); the other names map to the
// file-local instances. An unknown name throws invalid_argument.
const CharSet* getCharSet(const string& code) throw(invalid_argument)
{
  if(code == "default") return CharSetDefault::get();

  // Encoding name -> shared instance.
  struct Entry {
    const char* name;
    const CharSet* charset;
  };
  const Entry table[] = {
    { "sjis", &g_sjis },
    { "euc",  &g_euc  },
    { "utf8", &g_utf8 },
    { "jis",  &g_jis  },
  };

  const unsigned int count = sizeof(table) / sizeof(table[0]);
  for(unsigned int i = 0; i < count; ++i){
    if(code == table[i].name) return table[i].charset;
  }

  throw invalid_argument( string("指定の文字コードが見つかりません。") + code);
}
 
 
 
// Length of a single-byte character, with line-break handling:
// a CR immediately followed by LF counts as one 2-byte unit; anything else
// (including a lone CR or a lone LF) is 1 byte.
// c is the byte under inspection; p[1] is read only when c is CR.
inline const int _returnAscii(const unsigned char& c, const char* &p){
  const bool isCrLf = (c == 0x0D) && (p[1] == 0x0A);
  return isCrLf ? 2 : 1;
}
 
 
/**
Counts the bytes of one Shift_JIS character from its first byte.
<pre>
Control codes        0x00-0x1F, 0x7F
ASCII                0x20-0x7E
Half-width katakana  0xA1-0xDF
Kanji                0x8140-0x9FFC, 0xE040-0xFCFC
  (1st byte: 0x81-0x9F, 0xE0-0xFC
   2nd byte: 0x40-0x7E, 0x80-0xFC)
</pre>
Only the first byte is classified; the second byte of a kanji is not
validated. Every non-kanji byte, half-width katakana included, is reported
with type ASCII.
*/
const int SjisCharSet::calc(const char* p, CharSet::Type& type) const
{
  const unsigned char lead = *p;

  const bool lowBlock  = (0x81 <= lead && lead <= 0x9F);
  const bool highBlock = (0xE0 <= lead && lead <= 0xFC);

  if(!lowBlock && !highBlock){
    // Single-byte character (a CR LF pair is counted as 2 bytes).
    type = ASCII;
    return _returnAscii(lead, p);
  }

  // First byte of a two-byte kanji.
  type = KANJI;
  return 2;
}
 
 
/**
Counts the bytes of one EUC character from its first byte.
<pre>
Control codes        0x00-0x1F, 0x7F
ASCII                0x20-0x7E
Kanji                0xA1A1-0xFEFE      (both bytes 0xA1-0xFE)
Half-width katakana  0x8EA1-0x8EDF      (0x8E prefix + 1 byte)
Supplementary kanji  0x8FA1A1-0x8FFEFE  (0x8F prefix + 2 bytes, each 0xA1-0xFE)
</pre>
Only the first byte is classified; trailing bytes are not validated.
@param p    [in] First byte of the character.
@param type [out] KANJI for every multi-byte form, ASCII otherwise.
@return 3 for a 0x8F lead byte, 2 for 0x8E or 0xA1-0xFE, otherwise the
        single-byte length (2 for a CR LF pair, else 1).
*/
const int EucCharSet::calc(const char* p, CharSet::Type& type) const
{
  const unsigned char c = *p;

  // The 0x8E / 0x8F prefixes lie outside 0xA1-0xFE, so they must be tested
  // on their own. Previously they were only checked inside the 0xA1-0xFE
  // branch, which made the 3-byte case unreachable and split these
  // characters into single bytes.
  if(c == 0x8F){
    // Supplementary kanji: prefix + 2 bytes.
    type = KANJI;
    return 3;
  }
  if(c == 0x8E){
    // Half-width katakana: prefix + 1 byte.
    // NOTE(review): reported as KANJI to match the original multi-byte
    // branch; confirm whether callers would prefer KANA here.
    type = KANJI;
    return 2;
  }

  // Ordinary two-byte kanji.
  if(c >= 0xA1 && c <= 0xFE){
    type = KANJI;
    return 2;
  }

  // Single-byte character.
  type = ASCII;
  return _returnAscii(c, p);
}
 
 
/**
Counts the bytes of one UTF-8 character.
<pre>
The length is decided from the first byte alone.
00-7F  1-byte character
C0-DF  2-byte character
E0-EF  3-byte character
F0-F7  4-byte character
F8-FB  5-byte character
FC-FD  6-byte character
</pre>
Any other first byte (0x80-0xBF, 0xFE, 0xFF) yields 0 with type KANJI.
*/
const int Utf8CharSet::calc(const char* p, CharSet::Type& type) const
{
  const unsigned char lead = *p;

  // Single-byte range (a CR LF pair is counted as 2 bytes).
  if(lead <= 0x7F){
    type = ASCII;
    return _returnAscii(lead, p);
  }

  // Everything from here on is reported as KANJI.
  type = KANJI;

  if(lead < 0xC0)  return 0;  // 0x80-0xBF: not a valid first byte
  if(lead <= 0xDF) return 2;
  if(lead <= 0xEF) return 3;
  if(lead <= 0xF7) return 4;
  if(lead <= 0xFB) return 5;
  if(lead <= 0xFD) return 6;

  // 0xFE, 0xFF
  return 0;
}
 
 
 
/**
Counts the bytes of one JIS character.
<pre>
A 3-byte escape sequence marks the start of a run of 2-byte or 1-byte
characters.
Switch to ASCII mode                \x1b(B  => {0x1B, 0x28, 0x42}
Switch to full-width Japanese mode  \x1b$B  => {0x1B, 0x24, 0x42}
Switch to half-width Japanese mode  \x1b(I  => {0x1B, 0x28, 0x49}
</pre>
ptype is both input and output: without an escape sequence the length is
chosen from the kind currently stored in ptype; with one, ptype is updated
and the returned length includes the 3 escape bytes.
Any escape sequence that is neither "ESC $ ..." nor "ESC ( B" is treated as
the half-width switch. Returns 0 when there is no escape sequence and ptype
is none of ASCII, KANJI, KANA.
*/
const int JisCharSet::calc(const char* p1, CharSet::Type& ptype) const
{
  const unsigned char first = *p1;

  if(first != 0x1B){
    // No escape sequence: stay in the current mode.
    if(ptype == ASCII) return _returnAscii(first, p1);
    if(ptype == KANJI) return 2;
    if(ptype == KANA)  return 1;
    return 0;
  }

  // Escape sequence: look at the byte after ESC.
  const unsigned char second = p1[1];
  if(second == 0x24){ // '$'
    // Start of full-width text: 3 escape bytes + 2 character bytes.
    ptype = KANJI;
    return 5;
  }

  const unsigned char third = p1[2];
  if(second == 0x28 && third == 0x42){
    // Start of ASCII text: 3 escape bytes + the single-byte length.
    ptype = ASCII;
    const char* body = p1 + 3;
    const unsigned char ch = *body;
    return _returnAscii(ch, body) + 3;
  }

  // Start of half-width Japanese text: 3 escape bytes + 1 character byte.
  ptype = KANA;
  return 4;
}
 
 
 
// Compares two JIS characters, ignoring a leading 3-byte escape sequence on
// either side. Characters of different kinds are never equal; two characters
// at the same address are equal; a null position on only one side is unequal.
const bool JisCharSet::equals(const MChar& m1, const MChar& m2) const{
  if(m1.type() != m2.type()) return false;

  const char* lhs = m1.pos();
  int lhsLen = m1.size();
  const char* rhs = m2.pos();
  int rhsLen = m2.size();

  if(lhs == rhs) return true;
  if(lhs == NULL || rhs == NULL) return false;

  // Skip the escape sequence (ESC + 2 bytes) so only the character bytes
  // take part in the comparison.
  const bool lhsEscaped = (static_cast<unsigned char>(*lhs) == 0x1B);
  const bool rhsEscaped = (static_cast<unsigned char>(*rhs) == 0x1B);
  if(lhsEscaped){
    lhs += 3;
    lhsLen -= 3;
  }
  if(rhsEscaped){
    rhs += 3;
    rhsLen -= 3;
  }

  return lhsLen == rhsLen && strncmp(lhs, rhs, lhsLen) == 0;
}
 
 
 
//==========文字コード変換=========================
 

/// Converts one two-byte Shift_JIS character to EUC in place.
/// kj1 is the first byte and kj2 the second; both are overwritten with the
/// EUC bytes. All arithmetic wraps modulo 256 (unsigned char).
inline void sjis2euc(unsigned char& kj1, unsigned char& kj2)
{
  // Doubling drops the top bit, so first bytes 0x81-0x9F land below 0x3F
  // while 0xE0 and above stay at or above it.
  kj1 = static_cast<unsigned char>(kj1 << 1);

  if(kj2 >= 0x9F){
    // Second byte in the upper half.
    kj1 = static_cast<unsigned char>(kj1 + (kj1 < 0x3F ? -0x60 : 0x20));
    kj2 = static_cast<unsigned char>(kj2 + 0x02);
    return;
  }

  // Second byte in the lower half.
  kj1 = static_cast<unsigned char>(kj1 + (kj1 < 0x3F ? -0x61 : 0x1F));
  kj2 = static_cast<unsigned char>(kj2 + (kj2 > 0x7E ? 0x60 : 0x61));
}
 
 
/// Converts the Shift_JIS character src to EUC and writes it to the stream.
/// - first byte below 0x80: written unchanged (1 byte)
/// - first byte 0xA1-0xDF (half-width katakana): written as 0x8E + the byte
/// - anything else: treated as a two-byte kanji and converted with sjis2euc
/// Original author's open question: would it be better to return the
/// converted character (as const string&) instead?
void Sjis2Euc::operator () (const MChar& src)
{
  const char* bytes = src.pos();
  unsigned char lead = *bytes;

  const bool singleByte = (lead < 0x80);
  const bool halfKana = (0xA1 <= lead && lead <= 0xDF);

  if(singleByte || halfKana){
    // Half-width katakana gets the 0x8E prefix in EUC.
    if(halfKana) m_os->put(static_cast<char>(0x8E));
    m_os->put(lead);
    return;
  }

  // Kanji: convert both bytes together.
  unsigned char trail = bytes[1];
  sjis2euc(lead, trail);

  m_os->put(lead);
  m_os->put(trail);
}
 
 
}; //namespace nana