StringTools autodecode

The viewer can already decode these encodings automatically, so I'd like to request that StringTools' decoding be enhanced to do the same (instead of having to write redundant JScript code).

Test script
function OnClick(clickData) {
    var tab = clickData.func.sourcetab;
    var fsu = DOpus.FSUtil();
    var ST = DOpus.Create.StringTools();

    for (var e = new Enumerator(tab.selected_files); !e.atEnd(); e.moveNext()) {
        var file = e.item();
        var filePath = file.realpath;
        if (fsu.Exists(filePath)) {
            var openText = fsu.OpenFile(filePath, "r");
            var text = openText.Read();
            openText.Close();
            var textDecode = ST.Decode(text, "auto");   // var textDecode = ST.Decode(text, "utf8");
            var textContent = ST.Truncate(textDecode, 2048, 0);
            DOpus.Output(textContent);
        }
    }
}

Result:

| Format         | Decode: utf-8 | Decode: auto                  |
|----------------|---------------|-------------------------------|
| UTF-8          | √             | × (throws an error, ANSI too) |
| UTF-8 with BOM | √             | √                             |
| UTF-16 BE      | ×             | √                             |
| UTF-16 LE      | ×             | √                             |
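
A minimal workaround sketch (just an illustration, not the built-in behaviour being requested): try "auto" first and fall back to "utf8" when Decode throws, reusing ST and text from the test script above.

// Sketch: fall back to "utf8" if "auto" throws (the BOM-less UTF-8 / ANSI case above)
var textDecode;
try {
    textDecode = ST.Decode(text, "auto");
} catch (err) {
    textDecode = ST.Decode(text, "utf8");   // assumption: plain UTF-8/ANSI is an acceptable default
}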

There’s no reliable way to auto-detect how text is encoded if it doesn’t start with a BOM, and there are many famous examples of what can go wrong if you try.

It could try to recognize more code pages, but the more there are, the more errors it will cause. :upside_down_face:
I removed some code pages; in addition to the ones above, it also supports gb2312, big5 and Shift-JIS (which ST.Decode can't seem to recognize?), but it's enough for me.

function OnClick(clickData) { 
    var tab = clickData.func.sourcetab;
    var fsu = DOpus.FSUtil();
    var ST = DOpus.Create.StringTools();
    
    // Loop through all selected files in the source tab
    for (var e = new Enumerator(tab.selected_files); !e.atEnd(); e.moveNext()) {
        var file = e.item();
        var filePath = file.realpath;
        
        // Check if the file exists
        if (fsu.Exists(filePath)) {
            var encoding = detectFileEncoding(filePath);
            var sysEnc = getSystemANSIEncoding();
            DOpus.Output('File Encoding: ' + encoding + '\nIs ANSI: ' + (encoding == sysEnc ? "Yes" : "No"));

            // Read the file content
            var openText = fsu.OpenFile(filePath, "r");
            var bytes = openText.Read();
            openText.Close();

            var textDecode = ST.Decode(bytes, encoding);
            var textContent = ST.Truncate(textDecode, 2048, 0);
            DOpus.Output('Content:\n' + textContent);
        }
    }
}

// Detect the file encoding based on BOM and byte patterns
function detectFileEncoding(fp) {
    try {
        var stream = new ActiveXObject("ADODB.Stream");
        stream.Type = 1; // adTypeBinary
        stream.Open();
        stream.LoadFromFile(fp);
        try { var bytes = new VBArray(stream.Read(4096)).toArray(); }
        catch (error) { stream.Close(); return "UTF-8"; }
        stream.Close();
        // ================== BOM Detection ==================
        if (bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) return "UTF-8";
        if (bytes[0] === 0xFF && bytes[1] === 0xFE) return "UTF-16 LE";
        if (bytes[0] === 0xFE && bytes[1] === 0xFF) return "UTF-16 BE";
        // ================== Encoding Feature Detection ==================
        var scores = { utf8: 0, gbk: 0, big5: 0, euckr: 0, sjis: 0 };
        var sysEnc = getSystemANSIEncoding();
        
        for (var i = 0; i < bytes.length; i++) {
            var b1 = bytes[i];
            
            // ------------------ UTF-8 Strict Validation ------------------
            if (b1 < 0x80) {
                scores.utf8 += 0.1; // Lower ASCII weight
            } else if ((b1 & 0xE0) === 0xC0) { // 2-byte sequence
                if (i + 1 >= bytes.length) break;
                var b2 = bytes[++i];
                if ((b2 & 0xC0) === 0x80) {
                    if (b1 >= 0xC2 && b1 <= 0xF4) scores.utf8 += 1;
                }
            } else if ((b1 & 0xF0) === 0xE0) { // 3-byte sequence
                if (i + 2 >= bytes.length) break;
                var b2 = bytes[++i], b3 = bytes[++i];
                if ((b2 & 0xC0) === 0x80 && (b3 & 0xC0) === 0x80) {
                    if (!(b1 === 0xED && b2 >= 0xA0)) scores.utf8 += 2;
                }
            }
            
            // ------------------ Double-byte Encoding Detection ------------------
            if (++i >= bytes.length) break;
            var b2 = bytes[i];
            // GBK precise detection (only GB2312 core area)
            if (
                (b1 >= 0xB0 && b1 <= 0xF7 && b2 >= 0xA1 && b2 <= 0xFE) ||
                (b1 >= 0xA1 && b1 <= 0xA9 && b2 >= 0xA1 && b2 <= 0xA3)
            ) {
                scores.gbk += 2;
            }
            // Big5 precise detection (high-frequency character area)
            else if (
                (b1 >= 0xA4 && b1 <= 0xC6 && b2 >= 0x40 && b2 <= 0x7E) ||
                (b1 >= 0xC9 && b1 <= 0xF9 && b2 >= 0x40 && b2 <= 0x6B)
            ) {
                scores.big5 += 2;
            }
            // Shift-JIS precise detection (including common Japanese characters)
            else if (
                (b1 >= 0x81 && b1 <= 0x9F && b2 >= 0x40 && b2 <= 0x7E) || // Japanese characters
                (b1 >= 0xE0 && b1 <= 0xFC && b2 >= 0x40 && b2 <= 0x7E)
            ) {
                scores.sjis += 3; // Shift-JIS weight
            }
        }
        // ================== Dynamic Judgment Logic ==================
        var maxScore = Math.max(scores.utf8, scores.gbk, scores.big5, scores.euckr, scores.sjis);
        // Judgment rules (priority: system encoding > Shift-JIS > Big5 > UTF-8 > GBK)
        if (maxScore < 2) return sysEnc; // No significant features
        
        if (scores.sjis >= 3 && scores.sjis >= maxScore * 0.8) return "Shift-JIS";
        if (scores.big5 >= 3 && scores.big5 >= maxScore * 0.8) return "big5";
        if (scores.utf8 >= 3 && scores.utf8 >= maxScore * 0.9) return "UTF-8";
        if (scores.gbk >= 2 && sysEnc === "gb2312") return "gb2312";
        
        return sysEnc; // Default fallback
    } catch (e) {
        return "UTF-8";
    }
}

// Get the system ANSI encoding (enhanced version)
function getSystemANSIEncoding() {
    try {
        var acp = new ActiveXObject("WScript.Shell")
            .RegRead("HKLM\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\ACP");
        // Extended encoding mapping
        var encMap = { 
            "936": "gb2312",   // Simplified Chinese
            "950": "big5",     // Traditional Chinese
            "932": "Shift_JIS" // Japanese
        };
        return encMap[acp] || "UTF-8";
    } catch(e) {
        return "UTF-8";
    }
}

Windows 11 24H2

@WKen Is your autodecode function still the same or did you improve it in the meantime - or perhaps replace it with something else? I couldn’t find other approaches on the forum, at least not with a quick search.

In its current form it certainly does not work in all cases, but it did work in others, so I found it useful. I was surprised to learn how tricky recognition can be, given that tools like Notepad++, or the good old UltraEdit, seem to recognize formats flawlessly.

That's enough for me; I didn't improve it further.

function OnClick(clickData) { 
    var tab = clickData.func.sourcetab;
    var fsu = DOpus.FSUtil();
    var ST = DOpus.Create.StringTools();
    
    // Loop through all selected files in the source tab
    for (var e = new Enumerator(tab.selected_files); !e.atEnd(); e.moveNext()) {
        var file = e.item();
        var filePath = file.realpath;
        
        // Check if the file exists
        if (fsu.Exists(filePath)) {
            var Bytes_To_Read = 2048;
            var openText = fsu.OpenFile(filePath, "r");
            var bytes = openText.Read(Bytes_To_Read);
            openText.Close();

            var encoding = "auto";
            var textContent;
            try { textContent = ST.Decode(bytes, encoding); }
            catch (error) {
                encoding = detectFileEncoding(filePath);
                if (encoding == "unsupported") { textContent = "-System version is too old, cannot decode-"; }
                else { textContent = ST.Decode(bytes, encoding); }
            }
            textContent = ST.Truncate(textContent, Bytes_To_Read - 100, 0);   // Truncated display...
            var sysEnc = getSystemANSIEncoding();
            DOpus.Output('File Encoding: ' + encoding + '\nIs ANSI: ' + (encoding == sysEnc ? "Yes" : "No"));

            DOpus.Output('Content:\n' + textContent);
        }
    }
}

// Detect the file encoding based on BOM and byte patterns
function detectFileEncoding(fp) {
	var stream = new ActiveXObject("ADODB.Stream");
	stream.Type = 1; // adTypeBinary
	stream.Open();
	stream.LoadFromFile(fp);
	if (stream.Size == 0) {
		stream.Close();
		return "UTF-8";
	} else {
		try { var bytes = new VBArray(stream.Read(4096)).toArray(); }
		catch (error) {
			stream.Close();
			DOpus.Output("System version is too old, cannot decode: " + error);
			return "unsupported";
		}
	}
	stream.Close();
	// ================== BOM detection ==================
	if (bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) return "UTF-8";
	if (bytes[0] === 0xFF && bytes[1] === 0xFE) return "UTF-16 LE";
	if (bytes[0] === 0xFE && bytes[1] === 0xFF) return "UTF-16 BE";
	// ================== Encoding feature detection ==================
	var scores = { utf8: 0, gbk: 0, big5: 0, euckr: 0, sjis: 0 };
	var sysEnc = getSystemANSIEncoding();
	
	for (var i = 0; i < bytes.length; i++) {
		var b1 = bytes[i];
		
		// ------------------ Strict UTF-8 validation ------------------
		if (b1 < 0x80) {
			scores.utf8 += 0.1; // Reduced ASCII weight
		} else if ((b1 & 0xE0) === 0xC0) { // 2-byte sequence
			if (i + 1 >= bytes.length) break;
			var b2 = bytes[++i];
			if ((b2 & 0xC0) === 0x80) {
				if (b1 >= 0xC2 && b1 <= 0xF4) scores.utf8 += 1;
			}
		} else if ((b1 & 0xF0) === 0xE0) { // 3-byte sequence
			if (i + 2 >= bytes.length) break;
			var b2 = bytes[++i], b3 = bytes[++i];
			if ((b2 & 0xC0) === 0x80 && (b3 & 0xC0) === 0x80) {
				if (!(b1 === 0xED && b2 >= 0xA0)) scores.utf8 += 2;
			}
		}
		
		// ------------------ Double-byte encoding detection ------------------
		if (++i >= bytes.length) break;
		var b2 = bytes[i];
		// GBK precise detection (GB2312 core area only)
		if (
			(b1 >= 0xB0 && b1 <= 0xF7 && b2 >= 0xA1 && b2 <= 0xFE) ||
			(b1 >= 0xA1 && b1 <= 0xA9 && b2 >= 0xA1 && b2 <= 0xA3)
		) {
			scores.gbk += 2;
		}
		// Big5 precise detection (high-frequency character ranges)
		else if (
			(b1 >= 0xA4 && b1 <= 0xC6 && b2 >= 0x40 && b2 <= 0x7E) ||
			(b1 >= 0xC9 && b1 <= 0xF9 && b2 >= 0x40 && b2 <= 0x6B)
		) {
			scores.big5 += 2;
		}
		// Shift-JIS precise detection (including common Japanese character sets)
		else if (
			(b1 >= 0x81 && b1 <= 0x9F && b2 >= 0x40 && b2 <= 0x7E) || // Japanese characters
			(b1 >= 0xE0 && b1 <= 0xFC && b2 >= 0x40 && b2 <= 0x7E)
		) {
			scores.sjis += 3; // Shift-JIS weight
		}
	}
	// ================== Dynamic decision logic ==================
	var maxScore = Math.max(scores.utf8, scores.gbk, scores.big5, scores.euckr, scores.sjis);
	// Decision rules (priority: system encoding > Shift-JIS > Big5 > UTF-8 > GBK)
	if (maxScore < 2) return sysEnc; // No significant features detected
	
	if (scores.sjis >= 3 && scores.sjis >= maxScore * 0.8) return 932; // code page number; ST.Decode doesn't seem to accept the "Shift-JIS" name
	if (scores.big5 >= 3 && scores.big5 >= maxScore * 0.8) return "big5";
	if (scores.utf8 >= 3 && scores.utf8 >= maxScore * 0.9) return "UTF-8";
	if (scores.gbk >= 2 && sysEnc === "gb2312") return "gb2312";
	
	return sysEnc; // Default fallback
}

// Get the system ANSI encoding (enhanced version)
function getSystemANSIEncoding() {
    try {
        var acp = new ActiveXObject("WScript.Shell")
            .RegRead("HKLM\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\ACP");
        // Extended encoding mapping
        var encMap = { 
            "936": "gb2312",   // Simplified Chinese
            "950": "big5",     // Traditional Chinese
            "932": "Shift_JIS" // Japanese
        };
        return encMap[acp] || "UTF-8";
    } catch(e) {
        return "UTF-8";
    }
}
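
Untested side note: detectFileEncoding() above already passes the code page number 932 straight to ST.Decode, so the name mapping in getSystemANSIEncoding() could probably be replaced by returning the raw ACP value. Whether ST.Decode accepts arbitrary code-page numbers (and 65001 for UTF-8) is an assumption here.

// Untested variant (assumption: ST.Decode accepts code-page numbers in general)
function getSystemANSICodePage() {
    try {
        var acp = new ActiveXObject("WScript.Shell")
            .RegRead("HKLM\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\ACP");
        return parseInt(acp, 10);   // e.g. 936, 950, 932, 1252
    } catch (e) {
        return 65001;               // assumed UTF-8 fallback code page
    }
}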

But… if you already opened the file and read the blob with File.Read(), why are you opening and reading it again using ActiveX?

And wouldn't it be enough for anyone who needs to detect encodings to just use something like uchardet (which I think is what Notepad++ uses) and move on?
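
For illustration only, and assuming a uchardet command-line binary is installed and on the PATH (which is exactly the kind of external dependency the script above avoids), calling it from JScript could look roughly like this:

// Hypothetical sketch, not from the thread: shell out to uchardet.exe for an encoding name.
// Assumes uchardet is on the PATH; its output format may vary between builds.
function detectWithUchardet(filePath) {
    var shell = new ActiveXObject("WScript.Shell");
    var exec = shell.Exec('uchardet "' + filePath + '"');
    var name = exec.StdOut.ReadAll();           // blocks until uchardet exits
    return name.replace(/\s+$/, "");            // e.g. "UTF-8", "GB18030"
}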

That’s not a universal solution because the byte types are different.

Yes, there is feedback:

:face_with_diagonal_mouth:

I'm not sure if I'm following you or if it's the other way around.
My point was that you don't need to open and read the file twice. You don't need the ADODB thing either.

Again, I was talking about using it in your own script, not as a built-in feature of the Opus API.
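
As a sketch of that idea (an illustration only, not verified in this thread: it assumes the Blob returned by File.Read supports a size property and byte indexing as blob(n) in JScript), the BOM check could run on the bytes already read, without reloading the file through ADODB:

// Sketch only: reuse the bytes already read with fsu.OpenFile().Read()
// instead of reloading the file through ADODB.Stream.
// Assumption: the returned Blob supports blob.size and byte indexing blob(n).
var openText = fsu.OpenFile(filePath, "r");
var blob = openText.Read(4096);
openText.Close();

var encoding = "UTF-8"; // fallback
if (blob.size >= 3 && blob(0) == 0xEF && blob(1) == 0xBB && blob(2) == 0xBF) encoding = "UTF-8";
else if (blob.size >= 2 && blob(0) == 0xFF && blob(1) == 0xFE) encoding = "UTF-16 LE";
else if (blob.size >= 2 && blob(0) == 0xFE && blob(1) == 0xFF) encoding = "UTF-16 BE";
var textContent = ST.Decode(blob, encoding);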

It seems that ADODB cannot be replaced: when I try, the results from the detectFileEncoding() function become inconsistent, and I cannot resolve this.

I am not a programmer, and I try to avoid external dependencies.