ICUでShift-JIS, EUC-JP, UTF-8の相互変換

コード

ヘッダ

#ifndef string_encoder_hpp
#define string_encoder_hpp

#include <string>

namespace encoding {
    // Collection of static helpers that re-encode text between EUC-JP,
    // Shift-JIS and UTF-8. The class declares no data members; every function
    // takes the encoded bytes as a std::string and returns a new std::string
    // holding the re-encoded bytes.
    class Encoder{
    public:
        // From EUC-JP
        // EUC-JP bytes -> Shift-JIS bytes.
        static std::string EucToSjis(const std::string &value);
        // EUC-JP bytes -> UTF-8 bytes.
        static std::string EucToUtf8(const std::string &value);

        // From Shift-JIS
        // Shift-JIS bytes -> EUC-JP bytes.
        static std::string SjisToEuc(const std::string &value);
        // Shift-JIS bytes -> UTF-8 bytes.
        static std::string SjisToUtf8(const std::string &value);

        // From UTF-8
        // UTF-8 bytes -> EUC-JP bytes.
        static std::string Utf8ToEuc(const std::string &value);
        // UTF-8 bytes -> Shift-JIS bytes.
        static std::string Utf8ToSjis(const std::string &value);
    };
}

#endif /* string_encoder_hpp */

実装

#include <vector>

#include <unicode/unistr.h>

#include "string_encoder.hpp"

namespace encoding {
    namespace internal {
        // Encoding names supplied as the `from` / `to` arguments of encode().
        namespace encode_name {
            //! Name used for EUC-JP.
            static const std::string kEucJp{"euc-jp"};
            //! Name used for Shift-JIS.
            static const std::string kShiftJis{"shift-jis"};
            //! Name used for UTF-8.
            static const std::string kUtf8{"utf8"};
        }

        std::string encode(const std::string &value, const std::string &from, const std::string &to){
            icu::UnicodeString src(value.c_str(), from.c_str());

            // 出力バッファにnullptrを渡し変換後のバイト数が取得する
            const int length = src.extract(0, src.length(), nullptr, to.c_str());

            // 変換
            std::vector<char> result(length + 1);
            src.extract(0, src.length(), &result[0], to.c_str());

            return std::move(std::string(result.begin(), result.end() - 1));
        }
    }

    //! Converts EUC-JP encoded text to Shift-JIS.
    //! @param value Bytes encoded in EUC-JP.
    //! @return The result of internal::encode from EUC-JP to Shift-JIS.
    std::string Encoder::EucToSjis(const std::string &value){
        // Return the prvalue directly; wrapping it in std::move would only
        // disable copy elision.
        return internal::encode(value,
                    internal::encode_name::kEucJp,
                    internal::encode_name::kShiftJis);
    }
    //! Converts EUC-JP encoded text to UTF-8.
    //! @param value Bytes encoded in EUC-JP.
    //! @return The result of internal::encode from EUC-JP to UTF-8.
    std::string Encoder::EucToUtf8(const std::string &value){
        // Return the prvalue directly; wrapping it in std::move would only
        // disable copy elision.
        return internal::encode(value,
                    internal::encode_name::kEucJp,
                    internal::encode_name::kUtf8);
    }

    //! Converts Shift-JIS encoded text to EUC-JP.
    //! @param value Bytes encoded in Shift-JIS.
    //! @return The result of internal::encode from Shift-JIS to EUC-JP.
    std::string Encoder::SjisToEuc(const std::string &value){
        // Return the prvalue directly; wrapping it in std::move would only
        // disable copy elision.
        return internal::encode(value,
                    internal::encode_name::kShiftJis,
                    internal::encode_name::kEucJp);
    }
    //! Converts Shift-JIS encoded text to UTF-8.
    //! @param value Bytes encoded in Shift-JIS.
    //! @return The result of internal::encode from Shift-JIS to UTF-8.
    std::string Encoder::SjisToUtf8(const std::string &value){
        // Return the prvalue directly; wrapping it in std::move would only
        // disable copy elision.
        return internal::encode(value,
                    internal::encode_name::kShiftJis,
                    internal::encode_name::kUtf8);
    }

    //! Converts UTF-8 encoded text to EUC-JP.
    //! @param value Bytes encoded in UTF-8.
    //! @return The result of internal::encode from UTF-8 to EUC-JP.
    std::string Encoder::Utf8ToEuc(const std::string &value){
        // Return the prvalue directly; wrapping it in std::move would only
        // disable copy elision.
        return internal::encode(value,
                    internal::encode_name::kUtf8,
                    internal::encode_name::kEucJp);
    }
    //! Converts UTF-8 encoded text to Shift-JIS.
    //! @param value Bytes encoded in UTF-8.
    //! @return The result of internal::encode from UTF-8 to Shift-JIS.
    std::string Encoder::Utf8ToSjis(const std::string &value){
        // Return the prvalue directly; wrapping it in std::move would only
        // disable copy elision.
        return internal::encode(value,
                    internal::encode_name::kUtf8,
                    internal::encode_name::kShiftJis);
    }
}

動作確認

#include <iostream>
#include <string>
#include <vector>

#include "string_encoder.hpp"

// Compares two values with operator== and returns a label for the outcome:
// "Match" when they are equal, "Unmatch!" otherwise.
template <typename T>
std::string test(const T &expected, const T &actual){
    if (expected == actual) {
        return "Match";
    }
    return "Unmatch!";
}

// Demo driver: converts a sample string UTF-8 -> Shift-JIS -> EUC-JP -> UTF-8,
// compares each step against a hand-written byte dump and prints the outcome.
// It then round-trips a vendor-specific character and shows that an emoji
// cannot be recovered from Shift-JIS bytes.
int main(int argc, const char * argv[]) {
    // NOTE: "ｳ" is the half-width katakana (U+FF73) on purpose. The dumps
    // below encode it as 0xB3 (Shift-JIS) and 0x8E 0xB3 (EUC-JP); a full-width
    // "ウ" would encode differently and the comparisons would not match.
    const std::string utf8_string = "aこれはｳにこーど";
    std::cout << utf8_string << " is utf8 string, length:" << utf8_string.length() << std::endl;
    std::cout << std::endl;

    // utf8     -> sjis
    const std::string sjis_string = encoding::Encoder::Utf8ToSjis(utf8_string);

    // Shift-JIS bytes of the sample string
    const std::string sjis_dump = {
        'a',
        '\x82', '\xb1', // "こ"
        '\x82', '\xea', // "れ"
        '\x82', '\xcd', // "は"
        '\xb3',         // "ｳ" (half-width)
        '\x82', '\xc9', // "に"
        '\x82', '\xb1', // "こ"
        '\x81', '\x5b', // "ー"
        '\x82', '\xc7'  // "ど"
    };
    std::cout << "test sjis is " << test(sjis_dump, sjis_string) << std::endl;

    // sjis     -> eucjp
    const std::string euc_string = encoding::Encoder::SjisToEuc(sjis_string);
    // EUC-JP bytes of the sample string
    const std::string euc_dump = {
        'a',
        '\xa4', '\xb3', // "こ"
        '\xa4', '\xec', // "れ"
        '\xa4', '\xcf', // "は"
        '\x8e', '\xb3', // "ｳ" (half-width)
        '\xa4', '\xcb', // "に"
        '\xa4', '\xb3', // "こ"
        '\xa1', '\xbc', // "ー"
        '\xa4', '\xc9'  // "ど"
    };
    std::cout << "test euc  is " << test(euc_dump, euc_string) << std::endl;

    // eucjp    -> utf8
    const std::string return_utf8 = encoding::Encoder::EucToUtf8(euc_string);

    // The round trip must reproduce the original UTF-8 string.
    std::cout << "test utf8 is " << test(utf8_string, return_utf8) << std::endl;

    // Platform-dependent (vendor extension) character
    const std::string euc_contains_machine_dependent_char_string = {
        'a',
        '\xa4', '\xb3',         // "こ"
        '\xa4', '\xec',         // "れ"
        '\xa4', '\xcf',         // "は"
        '\xad', '\xc0', '\x0a', // "㍉" followed by a line feed (0x0A)
        '\xa4', '\xcb',         // "に"
        '\xa4', '\xb3',         // "こ"
        '\xa1', '\xbc',         // "ー"
        '\xa4', '\xc9'          // "ど"
    };
    // ASCII identifier: symbol code points such as U+3349 are not valid in
    // identifiers for compilers that follow the UAX #31 identifier rules.
    const std::string utf8_from_machine_dependent_euc = encoding::Encoder::EucToUtf8(euc_contains_machine_dependent_char_string);
    std::cout << "test ㍉    is " << test(euc_contains_machine_dependent_char_string, encoding::Encoder::Utf8ToEuc(utf8_from_machine_dependent_euc)) << std::endl;

    // Emoji are not part of Shift-JIS, so they cannot be converted.
    // The bytes below are '?' followed by a line feed -- presumably what a
    // conversion of the emoji to Shift-JIS produced; verify against the tool
    // that generated the dump.
    const std::string sjis_sushi_bytes = {
        '\x3f', '\x0a'
    };
    const std::string sushi_string = "🍣";
    std::cout << "test 🍣   is " << test(sushi_string, encoding::Encoder::SjisToUtf8(sjis_sushi_bytes)) << std::endl;
}

結果

aこれはウにこーど is utf8 string, length:25

test sjis is Match
test euc  is Match
test utf8 is Match
test ㍉    is Match
test 🍣   is Unmatch!

参考文献