StringTools autodecode

The viewer can already decode these encodings automatically, so I'd like to request that StringTools' decoding be enhanced to do the same (instead of having to write redundant JScript code).

Test script
function OnClick(clickData) {
    var tab = clickData.func.sourcetab;
    var fsu = DOpus.FSUtil();
    var ST = DOpus.Create.StringTools();

    for (var e = new Enumerator(tab.selected_files); !e.atEnd(); e.moveNext()) {
        var file = e.item();
        var filePath = file.realpath;
        if (fsu.Exists(filePath)) {
            var openText = fsu.OpenFile(filePath, "r");
            var text = openText.Read();
            openText.Close();
            var textDecode = ST.Decode(text, "auto");   // var textDecode = ST.Decode(text, "utf8");
            var textContent = ST.Truncate(textDecode, 2048, 0);
            DOpus.Output(textContent);
        }
    }
}

Result:

| Format         | Decode: utf-8 | Decode: auto                  |
|----------------|---------------|-------------------------------|
| UTF-8          | √             | × (throws an error, ANSI too) |
| UTF-8 with BOM | √             | √                             |
| UTF-16 BE      | ×             | √                             |
| UTF-16 LE      | ×             | √                             |
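
A minimal workaround sketch (just an illustration, not the built-in behaviour being requested): try "auto" first and fall back to "utf8" when Decode throws, reusing ST and text from the test script above.

// Sketch: fall back to "utf8" if "auto" throws (the BOM-less UTF-8 / ANSI case above)
var textDecode;
try {
    textDecode = ST.Decode(text, "auto");
} catch (err) {
    textDecode = ST.Decode(text, "utf8");   // assumption: plain UTF-8/ANSI is an acceptable default
}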

There’s no reliable way to auto-detect how text is encoded if it doesn’t start with a BOM, and there are many famous examples of what can go wrong if you try.

It could try to recognize more code pages, but the more there are, the more errors it will cause. :upside_down_face:
I removed some code pages; in addition to the ones above, it also supports gb2312, big5 and Shift-JIS (which ST.Decode can't seem to recognize?), but it's enough for me.

function OnClick(clickData) { 
    var tab = clickData.func.sourcetab;
    var fsu = DOpus.FSUtil();
    var ST = DOpus.Create.StringTools();
    
    // Loop through all selected files in the source tab
    for (var e = new Enumerator(tab.selected_files); !e.atEnd(); e.moveNext()) {
        var file = e.item();
        var filePath = file.realpath;
        
        // Check if the file exists
        if (fsu.Exists(filePath)) {
            var encoding = detectFileEncoding(filePath);
            var sysEnc = getSystemANSIEncoding();
            DOpus.Output('File Encoding: ' + encoding + '\nIs ANSI: ' + (encoding == sysEnc ? "Yes" : "No"));

            // Read the file content
            var openText = fsu.OpenFile(filePath, "r");
            var bytes = openText.Read();
            openText.Close();

            var textDecode = ST.Decode(bytes, encoding);
            var textContent = ST.Truncate(textDecode, 2048, 0);
            DOpus.Output('Content:\n' + textContent);
        }
    }
}

// Detect the file encoding based on BOM and byte patterns
function detectFileEncoding(fp) {
    try {
        var stream = new ActiveXObject("ADODB.Stream");
        stream.Type = 1; // adTypeBinary
        stream.Open();
        stream.LoadFromFile(fp);
        try { var bytes = new VBArray(stream.Read(4096)).toArray(); }
        catch (error) { stream.Close(); return "UTF-8"; }
        stream.Close();
        // ================== BOM Detection ==================
        if (bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) return "UTF-8";
        if (bytes[0] === 0xFF && bytes[1] === 0xFE) return "UTF-16 LE";
        if (bytes[0] === 0xFE && bytes[1] === 0xFF) return "UTF-16 BE";
        // ================== Encoding Feature Detection ==================
        var scores = { utf8: 0, gbk: 0, big5: 0, euckr: 0, sjis: 0 };
        var sysEnc = getSystemANSIEncoding();
        
        for (var i = 0; i < bytes.length; i++) {
            var b1 = bytes[i];
            
            // ------------------ UTF-8 Strict Validation ------------------
            if (b1 < 0x80) {
                scores.utf8 += 0.1; // Lower ASCII weight
            } else if ((b1 & 0xE0) === 0xC0) { // 2-byte sequence
                if (i + 1 >= bytes.length) break;
                var b2 = bytes[++i];
                if ((b2 & 0xC0) === 0x80) {
                    if (b1 >= 0xC2 && b1 <= 0xF4) scores.utf8 += 1;
                }
            } else if ((b1 & 0xF0) === 0xE0) { // 3-byte sequence
                if (i + 2 >= bytes.length) break;
                var b2 = bytes[++i], b3 = bytes[++i];
                if ((b2 & 0xC0) === 0x80 && (b3 & 0xC0) === 0x80) {
                    if (!(b1 === 0xED && b2 >= 0xA0)) scores.utf8 += 2;
                }
            }
            
            // ------------------ Double-byte Encoding Detection ------------------
            if (++i >= bytes.length) break;
            var b2 = bytes[i];
            // GBK precise detection (only GB2312 core area)
            if (
                (b1 >= 0xB0 && b1 <= 0xF7 && b2 >= 0xA1 && b2 <= 0xFE) ||
                (b1 >= 0xA1 && b1 <= 0xA9 && b2 >= 0xA1 && b2 <= 0xA3)
            ) {
                scores.gbk += 2;
            }
            // Big5 precise detection (high-frequency character area)
            else if (
                (b1 >= 0xA4 && b1 <= 0xC6 && b2 >= 0x40 && b2 <= 0x7E) ||
                (b1 >= 0xC9 && b1 <= 0xF9 && b2 >= 0x40 && b2 <= 0x6B)
            ) {
                scores.big5 += 2;
            }
            // Shift-JIS precise detection (including common Japanese characters)
            else if (
                (b1 >= 0x81 && b1 <= 0x9F && b2 >= 0x40 && b2 <= 0x7E) || // Japanese characters
                (b1 >= 0xE0 && b1 <= 0xFC && b2 >= 0x40 && b2 <= 0x7E)
            ) {
                scores.sjis += 3; // Shift-JIS weight
            }
        }
        // ================== Dynamic Judgment Logic ==================
        var maxScore = Math.max(scores.utf8, scores.gbk, scores.big5, scores.euckr, scores.sjis);
        // Judgment rules (priority: system encoding > Shift-JIS > Big5 > UTF-8 > GBK)
        if (maxScore < 2) return sysEnc; // No significant features
        
        if (scores.sjis >= 3 && scores.sjis >= maxScore * 0.8) return "Shift-JIS";
        if (scores.big5 >= 3 && scores.big5 >= maxScore * 0.8) return "big5";
        if (scores.utf8 >= 3 && scores.utf8 >= maxScore * 0.9) return "UTF-8";
        if (scores.gbk >= 2 && sysEnc === "gb2312") return "gb2312";
        
        return sysEnc; // Default fallback
    } catch (e) {
        return "UTF-8";
    }
}

// Get the system ANSI encoding (enhanced version)
function getSystemANSIEncoding() {
    try {
        var acp = new ActiveXObject("WScript.Shell")
            .RegRead("HKLM\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\ACP");
        // Extended encoding mapping
        var encMap = { 
            "936": "gb2312",   // Simplified Chinese
            "950": "big5",     // Traditional Chinese
            "932": "Shift_JIS" // Japanese
        };
        return encMap[acp] || "UTF-8";
    } catch(e) {
        return "UTF-8";
    }
}

Windows 11 24H2

@WKen Is your autodecode function still the same or did you improve it in the meantime - or perhaps replace it with something else? I couldn’t find other approaches on the forum, at least not with a quick search.

In its current form it certainly does not work in all cases, but it did work in others, so I found it useful. I was surprised to learn how tricky recognition can be, given that tools like Notepad++, or the good old UltraEdit, seem to recognize formats flawlessly.

That's enough for me; I didn't improve it further.

function OnClick(clickData) { 
    var tab = clickData.func.sourcetab;
    var fsu = DOpus.FSUtil();
    var ST = DOpus.Create.StringTools();
    
    // Loop through all selected files in the source tab
    for (var e = new Enumerator(tab.selected_files); !e.atEnd(); e.moveNext()) {
        var file = e.item();
        var filePath = file.realpath;
        
        // Check if the file exists
        if (fsu.Exists(filePath)) {
            var Bytes_To_Read = 2048;
            var openText = fsu.OpenFile(filePath, "r");
            var bytes = openText.Read(Bytes_To_Read);
            openText.Close();

            var encoding = "auto";
            var textContent;
            try { textContent = ST.Decode(bytes, encoding); }
            catch (error) {
                encoding = detectFileEncoding(filePath);
                if (encoding == "unsupported") { textContent = "-System version is too old, cannot decode-"; }
                else { textContent = ST.Decode(bytes, encoding); }
            }
            textContent = ST.Truncate(textContent, Bytes_To_Read - 100, 0);   // Truncated display...
            var sysEnc = getSystemANSIEncoding();
            DOpus.Output('File Encoding: ' + encoding + '\nIs ANSI: ' + (encoding == sysEnc ? "Yes" : "No"));

            DOpus.Output('Content:\n' + textContent);
        }
    }
}

// Detect the file encoding based on BOM and byte patterns
function detectFileEncoding(fp) {
	var stream = new ActiveXObject("ADODB.Stream");
	stream.Type = 1; // adTypeBinary
	stream.Open();
	stream.LoadFromFile(fp);
	if (stream.Size == 0) {
		stream.Close();
		return "UTF-8";
	} else {
		try { var bytes = new VBArray(stream.Read(4096)).toArray(); }
		catch (error) {
			stream.Close();
			DOpus.Output("System version is too old, cannot decode: " + error);
			return "unsupported";
		}
	}
	stream.Close();
	// ================== BOM detection ==================
	if (bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) return "UTF-8";
	if (bytes[0] === 0xFF && bytes[1] === 0xFE) return "UTF-16 LE";
	if (bytes[0] === 0xFE && bytes[1] === 0xFF) return "UTF-16 BE";
	// ================== Encoding feature detection ==================
	var scores = { utf8: 0, gbk: 0, big5: 0, euckr: 0, sjis: 0 };
	var sysEnc = getSystemANSIEncoding();
	
	for (var i = 0; i < bytes.length; i++) {
		var b1 = bytes[i];
		
		// ------------------ Strict UTF-8 validation ------------------
		if (b1 < 0x80) {
			scores.utf8 += 0.1; // Reduced ASCII weight
		} else if ((b1 & 0xE0) === 0xC0) { // 2-byte sequence
			if (i + 1 >= bytes.length) break;
			var b2 = bytes[++i];
			if ((b2 & 0xC0) === 0x80) {
				if (b1 >= 0xC2 && b1 <= 0xF4) scores.utf8 += 1;
			}
		} else if ((b1 & 0xF0) === 0xE0) { // 3-byte sequence
			if (i + 2 >= bytes.length) break;
			var b2 = bytes[++i], b3 = bytes[++i];
			if ((b2 & 0xC0) === 0x80 && (b3 & 0xC0) === 0x80) {
				if (!(b1 === 0xED && b2 >= 0xA0)) scores.utf8 += 2;
			}
		}
		
		// ------------------ Double-byte encoding detection ------------------
		if (++i >= bytes.length) break;
		var b2 = bytes[i];
		// GBK precise detection (GB2312 core area only)
		if (
			(b1 >= 0xB0 && b1 <= 0xF7 && b2 >= 0xA1 && b2 <= 0xFE) ||
			(b1 >= 0xA1 && b1 <= 0xA9 && b2 >= 0xA1 && b2 <= 0xA3)
		) {
			scores.gbk += 2;
		}
		// Big5 precise detection (high-frequency character ranges)
		else if (
			(b1 >= 0xA4 && b1 <= 0xC6 && b2 >= 0x40 && b2 <= 0x7E) ||
			(b1 >= 0xC9 && b1 <= 0xF9 && b2 >= 0x40 && b2 <= 0x6B)
		) {
			scores.big5 += 2;
		}
		// Shift-JIS precise detection (including common Japanese character sets)
		else if (
			(b1 >= 0x81 && b1 <= 0x9F && b2 >= 0x40 && b2 <= 0x7E) || // Japanese characters
			(b1 >= 0xE0 && b1 <= 0xFC && b2 >= 0x40 && b2 <= 0x7E)
		) {
			scores.sjis += 3; // Shift-JIS weight
		}
	}
	// ================== Dynamic decision logic ==================
	var maxScore = Math.max(scores.utf8, scores.gbk, scores.big5, scores.euckr, scores.sjis);
	// Decision rules (priority: system encoding > Shift-JIS > Big5 > UTF-8 > GBK)
	if (maxScore < 2) return sysEnc; // No significant features detected
	
	if (scores.sjis >= 3 && scores.sjis >= maxScore * 0.8) return 932; // code page number; ST.Decode doesn't seem to accept the "Shift-JIS" name
	if (scores.big5 >= 3 && scores.big5 >= maxScore * 0.8) return "big5";
	if (scores.utf8 >= 3 && scores.utf8 >= maxScore * 0.9) return "UTF-8";
	if (scores.gbk >= 2 && sysEnc === "gb2312") return "gb2312";
	
	return sysEnc; // Default fallback
}

// Get the system ANSI encoding (enhanced version)
function getSystemANSIEncoding() {
    try {
        var acp = new ActiveXObject("WScript.Shell")
            .RegRead("HKLM\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\ACP");
        // Extended encoding mapping
        var encMap = { 
            "936": "gb2312",   // Simplified Chinese
            "950": "big5",     // Traditional Chinese
            "932": "Shift_JIS" // Japanese
        };
        return encMap[acp] || "UTF-8";
    } catch(e) {
        return "UTF-8";
    }
}
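
Untested side note: detectFileEncoding() above already passes the code page number 932 straight to ST.Decode, so the name mapping in getSystemANSIEncoding() could probably be replaced by returning the raw ACP value. Whether ST.Decode accepts arbitrary code-page numbers (and 65001 for UTF-8) is an assumption here.

// Untested variant (assumption: ST.Decode accepts code-page numbers in general)
function getSystemANSICodePage() {
    try {
        var acp = new ActiveXObject("WScript.Shell")
            .RegRead("HKLM\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\ACP");
        return parseInt(acp, 10);   // e.g. 936, 950, 932, 1252
    } catch (e) {
        return 65001;               // assumed UTF-8 fallback code page
    }
}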

But… if you already opened the file and read the blob with File.Read(), why are you opening and reading it again using ActiveX?

And wouldn't it be enough for anyone who needs to detect encodings to just use something like uchardet (which I think is what Notepad++ uses) and move on?
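
For illustration only, and assuming a uchardet command-line binary is installed and on the PATH (which is exactly the kind of external dependency the script above avoids), calling it from JScript could look roughly like this:

// Hypothetical sketch, not from the thread: shell out to uchardet.exe for an encoding name.
// Assumes uchardet is on the PATH; its output format may vary between builds.
function detectWithUchardet(filePath) {
    var shell = new ActiveXObject("WScript.Shell");
    var exec = shell.Exec('uchardet "' + filePath + '"');
    var name = exec.StdOut.ReadAll();           // blocks until uchardet exits
    return name.replace(/\s+$/, "");            // e.g. "UTF-8", "GB18030"
}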

That’s not a universal solution because the byte types are different.

Yes, there is feedback:

:face_with_diagonal_mouth:

I'm not sure if I'm following you or if it's the other way around.
My point was that you don't need to open and read the file twice. You don't need the ADODB thing either.

Again, I was talking about using it in your own script, not as a built-in feature of the Opus API.
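
As a sketch of that idea (an illustration only, not verified in this thread: it assumes the Blob returned by File.Read supports a size property and byte indexing as blob(n) in JScript), the BOM check could run on the bytes already read, without reloading the file through ADODB:

// Sketch only: reuse the bytes already read with fsu.OpenFile().Read()
// instead of reloading the file through ADODB.Stream.
// Assumption: the returned Blob supports blob.size and byte indexing blob(n).
var openText = fsu.OpenFile(filePath, "r");
var blob = openText.Read(4096);
openText.Close();

var encoding = "UTF-8"; // fallback
if (blob.size >= 3 && blob(0) == 0xEF && blob(1) == 0xBB && blob(2) == 0xBF) encoding = "UTF-8";
else if (blob.size >= 2 && blob(0) == 0xFF && blob(1) == 0xFE) encoding = "UTF-16 LE";
else if (blob.size >= 2 && blob(0) == 0xFE && blob(1) == 0xFF) encoding = "UTF-16 BE";
var textContent = ST.Decode(blob, encoding);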

It seems that ADODB cannot be replaced: when I try, the results from the detectFileEncoding() function become inconsistent, and I cannot resolve this.

I am not a programmer, and I try to avoid external dependencies.