# k2uhr.pl
#
# Converts text containing EUC-KR (KS code) Hangul into three parallel forms.
#
# You need 2 data files:
#     dec-unicode-wansung.txt
#     dec-unicode-camo.txt
#
# and you must modify the 2 lines below that open them so that they carry
# the right full path on your system:
#
#     open(..., 'dec-unicode-wansung.txt')
#     open(..., 'dec-unicode-camo.txt')
#
# USAGE:
#     perl k2uhr.pl [input-file ...]
# Input is read with <>, i.e. from the files named on the command line or
# from standard input.  The result is written to the file "file.out".
#
# The input has EUC-KR (KS code) characters in it, and "file.out" holds,
# for every input line, three lines followed by a blank line:
#     1. the line with Hangul replaced by decimal Unicode character
#        references (&#NNNNN;),
#     2. the original EUC-KR line,
#     3. the line in HR (romanised) form.
#
# The tables and settings below are deliberately package globals: the
# conversion routines further down in this file read them by name.

# English letters.  A run of these is wrapped in "[" and "]" in the HR
# output.  Note that "-" and "*" are classified as English as well, because
# they are listed here.
$alphabet = "-*abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

# camo data: HR spelling of a stand-alone letter, indexed by
# (decimal Unicode - 12593), i.e. relative to U+3131.
@hr_camo = (
    "k",  "kk", "ks", "n",   "nj",  "nh", "t",  "tt", "r",  "rk",
    "rm", "rp", "rs", "rth", "rph", "rh", "m",  "p",  "pp", "ps",
    "s",  "ss", "0",  "j",   "jj",  "ch", "kh", "th", "ph", "h",
    "a",  "ae", "ya", "yae", "eo",  "e",  "yeo", "ye", "o", "wa",
    "wae", "oe", "yo", "u",  "wo",  "we", "wi", "yu", "eu", "eui",
    "i",
);

# cheos-so-ri data: HR spelling selected by the first component of a
# decomposed syllable (19 entries).
@hr_first = (
    "k", "kk", "n", "t",  "tt", "r",  "m",  "p",  "pp", "s",
    "ss", "",  "j", "jj", "ch", "kh", "th", "ph", "h",
);

# ka-un-tes-so-ri data: HR spelling selected by the second component of a
# decomposed syllable (21 entries).
@hr_second = (
    "a",   "ae", "ya", "yae", "eo", "e",  "yeo", "ye", "o",  "wa",
    "wae", "oe", "yo", "u",   "wo", "we", "wi",  "yu", "eu", "eui",
    "i",
);

# kkeus-so-ri data: HR spelling selected by the third component of a
# decomposed syllable (28 entries; index 0 is the empty string).
@hr_third = (
    "",   "k",  "kk", "ks",  "n",   "nj", "nh", "t",  "l",  "lk",
    "lm", "lp", "ls", "lth", "lph", "lh", "m",  "p",  "ps", "s",
    "ss", "ng", "j",  "ch",  "kh",  "th", "ph", "h",
);

$kugiri_moji   = "-";    # separator put between two adjacent HR syllables
$tandoku_moji  = "*";    # prefix marking a stand-alone letter in HR
$hr_hangul_off = "[";    # opens a run of English letters in HR
$hr_hangul_on  = "]";    # closes a run of English letters in HR

$flag_hangul = 1;        # not English mode
$flag_hr     = 0;        # the last char in hr_buffer is HR
$char_nature = "";       # 0:English 1:symbols 2:Hangul
$u_str       = "";       # to Unicode number
$hr_str      = "";       # to HR
$hr_buffer   = "";       # to HR (buffer)

# Lookup table: hex code of a KS Hangul char (as produced by get_hex_code)
# => decimal Unicode number.
%dec_u = ();

open(my $wansung_fh, '<', 'dec-unicode-wansung.txt')    # PLEASE MODIFY
    or die "Can't open dec-unicode-wansung.txt: $!\n";

# Each line holds two whitespace-separated fields: the decimal Unicode
# number, then the KS char whose first two bytes form the lookup key.
# The handle must be read explicitly: a while loop with an empty condition
# never terminates.
while (my $table_line = <$wansung_fh>) {
    my ($dec_unicode, $ks_char) = split(' ', $table_line);
    next unless defined $ks_char;    # skip blank or one-field lines
    $dec_u{ get_hex_code($ks_char) } = $dec_unicode;
}
close $wansung_fh;
# Strictures apply from here to the end of the file.  The tables and
# settings initialised earlier in the file are package globals, so they are
# brought into scope by name with "our" instead of being redefined.
use strict;
use warnings;

our ($alphabet, @hr_camo, @hr_first, @hr_second, @hr_third);
our ($kugiri_moji, $tandoku_moji, $hr_hangul_off, $hr_hangul_on);
our %dec_u;

# Add the camo table to the lookup hash.  Same layout as the wansung table:
# decimal Unicode number, whitespace, KS char.
open(my $camo_fh, '<', 'dec-unicode-camo.txt')    # PLEASE MODIFY
    or die "Can't open dec-unicode-camo.txt: $!\n";

# The handle must be read explicitly: a while loop with an empty condition
# never terminates.
while (my $table_line = <$camo_fh>) {
    my ($dec_unicode, $ks_char) = split(' ', $table_line);
    next unless defined $ks_char;    # skip blank or one-field lines
    $dec_u{ get_hex_code($ks_char) } = $dec_unicode;
}
close $camo_fh;

# MAIN PROGRAM
open(my $out_fh, '>', 'file.out')
    or die "Can't open file.out for writing: $!\n";

while (my $line = <>) {    # read each line
    chomp $line;           # drop the trailing "\n"

    my $ks_str      = $line;    # unconverted string (so KS code)
    my $u_str       = '';       # line with Unicode character references
    my $hr_str      = '';       # finished part of the HR line
    my $hr_buffer   = '';       # current, not yet flushed, HR run
    my $flag_hangul = 1;        # true: buffer holds a non-English run
    my $flag_hr     = 0;        # true: last thing in the buffer is HR

    my ($char_nature, $head_char, $in_str) = get_first_char($ks_str);
    while ($char_nature ne 'ERROR') {
        if ($char_nature == 0) {    # case: English char
            $flag_hr = 0;
            $u_str .= $head_char;
            if ($flag_hangul) {
                # Leaving a non-English run: flush it unbracketed.
                $flag_hangul = 0;
                $hr_str .= $hr_buffer;
                $hr_buffer = '';
            }
            $hr_buffer .= $head_char;
        }
        elsif ($char_nature == 1) {    # case: Symbol
            # Symbols stay in whichever run is currently open.
            $flag_hr = 0;
            $u_str     .= $head_char;
            $hr_buffer .= $head_char;
        }
        else {                         # case: Hangul (2-byte char)
            my $u_code = to_u($head_char);
            if ($u_code eq 'ERROR') {
                # No Unicode mapping: copy the raw bytes through.  The
                # buffer no longer ends in HR, so no separator may be
                # inserted before the next syllable.
                $flag_hr = 0;
                $u_str     .= $head_char;
                $hr_buffer .= $head_char;
            }
            else {
                $u_str .= '&#' . $u_code . ';';
                unless ($flag_hangul) {
                    # Leaving an English run: flush it in brackets.
                    $flag_hangul = 1;
                    $hr_str .= $hr_hangul_off . $hr_buffer . $hr_hangul_on;
                    $hr_buffer = '';
                }
                $hr_buffer .= $kugiri_moji if $flag_hr;
                $hr_buffer .= to_hr($u_code);
                $flag_hr = 1;
            }
        }
        ($char_nature, $head_char, $in_str) = get_first_char($in_str);
    }

    # flush the buffer
    if ($flag_hangul) {
        $hr_str .= $hr_buffer;
    }
    else {
        $hr_str .= $hr_hangul_off . $hr_buffer . $hr_hangul_on;
    }

    # Each input line yields three lines and a blank separator.
    print $out_fh $u_str,  "\n";
    print $out_fh $ks_str, "\n";
    print $out_fh $hr_str, "\n";
    print $out_fh "\n";
}

# Buffered write errors only surface at close time.
close($out_fh) or die "Can't close file.out: $!\n";

# get_hex_code(CHAR)
# Returns the first two bytes of CHAR as four upper-case hex digits.  Each
# byte is zero-padded to two digits so that distinct byte pairs can never
# produce the same key.  Missing bytes count as 0.
sub get_hex_code {
    my ($ks_char) = @_;
    my @bytes = unpack('C2', defined $ks_char ? $ks_char : '');
    push @bytes, 0 while @bytes < 2;
    return sprintf('%02X%02X', @bytes);
}

# get_first_char(STR)
# Splits the leading character off STR and returns the list
#     (NATURE, HEAD_CHAR, REST)
# where NATURE is 0 for a 7-bit char listed in $alphabet, 1 for any other
# 7-bit char, and 2 for a 2-byte char (first byte has the high bit set).
# Returns the one-element list ("ERROR") when STR is empty, or when STR
# consists of a single 8-bit byte with nothing after it.
sub get_first_char {
    my ($source_str) = @_;
    return ('ERROR') unless defined $source_str && length $source_str;

    my $first_byte = substr($source_str, 0, 1);
    if (ord($first_byte) < 0x80) {    # 7bit char
        my $nature = index($alphabet, $first_byte) != -1 ? 0 : 1;
        return ($nature, $first_byte, substr($source_str, 1));
    }

    # 8bit char: needs a second byte to form a 2-byte char.
    return ('ERROR') if length($source_str) < 2;
    return (2, substr($source_str, 0, 2), substr($source_str, 2));
}

# to3bul(DEC)
# Decomposes a decimal Unicode number from the 44032.. (U+AC00..) block
# into the list (FIRST, SECOND, THIRD), where
#     DEC - 44032 == (FIRST * 21 + SECOND) * 28 + THIRD.
sub to3bul {
    my ($dec) = @_;
    my $offset      = $dec - 44032;
    my $third_code  = $offset % 28;
    my $rest        = ($offset - $third_code) / 28;
    my $second_code = $rest % 21;
    my $first_code  = ($rest - $second_code) / 21;
    return ($first_code, $second_code, $third_code);
}

# to_camo(DEC)
# Converts a camo decimal Unicode number into its index in @hr_camo
# (offset from 12593, i.e. U+3131).
sub to_camo {
    my ($dec) = @_;
    return $dec - 12593;
}

# to_u(KS_CHAR)
# Looks up the decimal Unicode number of a KS char in %dec_u.
# Returns the string "ERROR" when the char is not in the tables.
sub to_u {
    my ($ks_char) = @_;
    my $dec_unicode = $dec_u{ get_hex_code($ks_char) };
    return 'ERROR' unless defined $dec_unicode && $dec_unicode ne '';
    return $dec_unicode;
}

# to_hr(DEC)
# Converts a decimal Unicode number to its HR spelling:
#   12593..12643 (U+3131..U+3163): $tandoku_moji followed by the @hr_camo
#                                  entry;
#   44032..55203 (U+AC00..U+D7A3): the three component spellings joined.
# Any other number yields the string "ERROR".
sub to_hr {
    my ($code) = @_;
    if ($code >= 12593 && $code <= 12643) {
        return $tandoku_moji . $hr_camo[ to_camo($code) ];
    }
    if ($code >= 44032 && $code <= 55203) {
        my ($first_code, $second_code, $third_code) = to3bul($code);
        return $hr_first[$first_code]
             . $hr_second[$second_code]
             . $hr_third[$third_code];
    }
    return 'ERROR';
}