Column text file encoding

option explicit

' Text Encode
' (c) 2016 qiuqi

' This is a script for Directory Opus.
' See http://www.gpsoft.com.au/DScripts/redirect.asp?page=scripts for development information.
'
'

'
'Here's the file "Hello" in various encodings:
'48 65 6C 6C 6F
'This is the traditional ANSI encoding.

'48 00 65 00 6C 00 6C 00 6F 00
'This is the Unicode (little-endian) encoding with no BOM.

'FF FE 48 00 65 00 6C 00 6C 00 6F 00
'This is the Unicode (little-endian) encoding with BOM. The BOM (FF FE) serves two purposes: First, it tags the file as a Unicode document, and second, the order in which the two bytes appear indicates that the file is little-endian.

'00 48 00 65 00 6C 00 6C 00 6F
'This is the Unicode (big-endian) encoding with no BOM. Notepad does not support this encoding.

'FE FF 00 48 00 65 00 6C 00 6C 00 6F
'This is the Unicode (big-endian) encoding with BOM. Notice that this BOM is in the opposite order from the little-endian BOM.

'EF BB BF 48 65 6C 6C 6F
'This is UTF-8 encoding. The first three bytes are the UTF-8 encoding of the BOM.

'2B 2F 76 38 2D 48 65 6C 6C 6F
'This is UTF-7 encoding. The first five bytes are the UTF-7 encoding of the BOM. Notepad doesn't support this encoding.
'When a file has no BOM, its encoding can only be guessed.
'
'
' Script entry point invoked by Directory Opus when the script is loaded.
' Fills in the script metadata on initData, stores the default FileExtension
' configuration value and registers the "GetTextEncode" column.
Function OnInit(initData)
	Dim defaultPatterns, encodeColumn

	' Default semicolon-separated list of patterns stored in config.FileExtension.
	defaultPatterns = "*.asp;*.aspx;*.asax;*.ascx;*.ashx;*.bat;*.cmd;*.c;*.h;*.cs;*.cpp;*.hpp;*.cc;*.c++;" & _
	                  "*.css;*.ini;*.inf;*.pas;*.dproj;*.bdsproj;*.dpr;*.dpk;*.dfm;*.fmx;*.nfm;*.xfm;*.lfm;*.e;*.groovy;" & _
	                  "*.html;*.htm;*.shtml;*.hta;*.jsl;*.java;*.jav;*.jsp;*.js;*.jse;*.json;*.pl;*.pm;" & _
	                  "*.plex;*.php;*.php4;*.phtml;*.ps1;*.py;*.pyw;*.rb;*.rbx;*.erb;*.resx;*.sql;*.tcl;*.txt;" & _
	                  "*.vbs;*.frm;*.vb;*.bas;*.xml;*.dtd;*.xhtml;*.xsl;*.xslt;*.wpl;*.xsd;*.xs"

	' Script-level metadata.
	With initData
		.name = "Text Encode"
		.version = "1.0"
		.copyright = "(c) 2016 qiuqi"
		' The url property is only assigned when running on 12.0.8 or newer.
		If DOpus.version.AtLeast("12.0.8") Then
			.url = "https://resource.dopus.com/viewforum.php?f=35"
		End If
		.desc = ""
		.default_enable = True
		.min_version = "12.0"
		.config.FileExtension = defaultPatterns
	End With

	' Column definition; values are produced by OnGetTextEncode.
	Set encodeColumn = initData.AddColumn
	With encodeColumn
		.name = "GetTextEncode"
		.method = "OnGetTextEncode"
		.label = "Encode"
		.justify = "left"
		.autogroup = False
		'.nosort = true
	End With
End Function


' Implement the GetTextEncode column.
'
' scriptColData - column data object supplied by Directory Opus; its .item is
'                 the file being displayed and .value receives the column text.
'
' The column is left blank for folders, for files without an extension and for
' files whose extension is not listed in the FileExtension configuration value.
' Empty files show "File Is Empty", files shorter than four bytes show "Unknow"
' (spelling kept so the value matches what GetEncoding returns), and every
' other file shows the result of GetEncoding.
Function OnGetTextEncode(scriptColData)
	Dim ExtStr, ExtSet

	If scriptColData.item.is_dir Then Exit Function

	' Files without an extension are never treated as text files. The check
	' must be made on the raw extension, before "*" is prepended.
	ExtStr = LCase(scriptColData.item.ext)
	If Len(ExtStr) = 0 Then Exit Function

	' Wrap both the configured list and the candidate in ";" delimiters so only
	' whole entries match. A plain substring search would let ".c" match
	' "*.cmd" or "*.cpp", and ".x" match "*.xml". Spaces are stripped so that
	' a list written as "*.txt; *.log" still works.
	ExtSet = ";" & Replace(LCase(Script.config.FileExtension), " ", "") & ";"
	If InStr(1, ExtSet, ";*" & ExtStr & ";") = 0 Then Exit Function

	If (scriptColData.item.size = 0) Then
		scriptColData.value = "File Is Empty"
	ElseIf (scriptColData.item.size < 4) Then
		' GetEncoding needs four bytes to classify a file.
		scriptColData.value = "Unknow"
	Else
		scriptColData.value = GetEncoding(scriptColData.item)
	End If
End Function

' Guess the text encoding of a file from its first four bytes.
'
' FileName - the file to inspect, passed straight to DOpus.FSUtil.OpenFile.
'
' Returns one of:
'   "UTF-8 BOM", "UTF16LE BOM", "UTF16BE BOM", "UTF32LE BOM", "UTF32BE BOM"
'       when a byte order mark is found;
'   "UTF16LE" / "UTF16BE"
'       when there is no BOM but the bytes alternate zero / non-zero;
'   "ANSI"   when nothing else matches (a fallback, not a positive detection);
'   "Unknow" when the file could not be opened or fewer than four bytes
'            could be read.
Function GetEncoding(FileName)
    Dim Files, Data, Encoding
    Dim b0, b1, b2, b3

    ' Number of bytes actually read; stays 0 if the file cannot be opened.
    Data = 0
    Set Encoding = DOpus.Create.Blob(0,0,0,0)
    Set Files = DOpus.FSUtil.OpenFile(FileName)
    If Files.Error = 0 Then
        Files.Seek 0, "b"
        Data = Files.Read(Encoding, 4)
        Files.Close
    End If
    If Data < 4 Then
        GetEncoding = "Unknow"
        Exit Function
    End If

    b0 = Encoding(0)
    b1 = Encoding(1)
    b2 = Encoding(2)
    b3 = Encoding(3)

    If (b0 = &HEF And b1 = &HBB And b2 = &HBF And b3 <> &H00) Then
        GetEncoding = "UTF-8 BOM"
    ElseIf (b0 = &HFF And b1 = &HFE And b2 = &H00 And b3 = &H00) Then
        ' FF FE 00 00 is the little-endian UTF-32 BOM. It has to be tested
        ' before the UTF-16 LE BOM because it starts with the same two bytes.
        GetEncoding = "UTF32LE BOM"
    ElseIf (b0 = &H00 And b1 = &H00 And b2 = &HFE And b3 = &HFF) Then
        ' 00 00 FE FF is the big-endian UTF-32 BOM.
        GetEncoding = "UTF32BE BOM"
    ElseIf (b0 = &HFF And b1 = &HFE) Then
        ' The BOM alone identifies UTF-16; the first character after it does
        ' not have to be ASCII (e.g. CJK text has a non-zero high byte).
        GetEncoding = "UTF16LE BOM"
    ElseIf (b0 = &HFE And b1 = &HFF) Then
        GetEncoding = "UTF16BE BOM"
    ElseIf (b0 <> &H00 And b1 = &H00 And b2 <> &H00 And b3 = &H00) Then
        ' No BOM: two characters with a zero high byte, low byte first.
        GetEncoding = "UTF16LE"
    ElseIf (b0 = &H00 And b1 <> &H00 And b2 = &H00 And b3 <> &H00) Then
        ' No BOM: two characters with a zero high byte, high byte first.
        GetEncoding = "UTF16BE"
    Else
        GetEncoding = "ANSI"
    End If
End Function

Thanks for posting this! o) It inspired me to add an "Encoding" column to an existing column set of mine (FileInfo).
I converted parts of your code to JScript and used your core logic to detect the file encoding. Very nice! o)
You are credited in the thread and code of course! o)

Smart-ass notes to your snippet:

  • col.label = "Encode" -> wouldn't "Encoding" be better here (this also applies to the script's internal name)?
  • Files.Seek 0, "b" -> not necessary?
  • initData.url -> this is meant to be set to the URL of this thread ("Column text file encoding"), if I'm not wrong

Not sure if returning "ANSI" is correct, I chose to keep it at "?" if there is no match, since you can't be sure if it's ASCII or ANSI with codepaging.
Not an expert on file encoding though, took me some years to finally understand some of the basics. o)

cya,
tbone

[quote="tbone"]Thanks for posting this! o) It inspired me to add an "Encoding" column to an existing column set of mine (FileInfo).
I converted parts of your code to JScript and used your core logic to detect the file encoding. Very nice! o)
You are credited in the thread and code of course! o)

Smart-ass notes to your snippet:

  • col.label = "Encode" -> wouldn't "Encoding" be better here (this also applies to the script's internal name)?
  • Files.Seek 0, "b" -> not necessary?
  • initData.url -> this is meant to be set to the URL of this thread ("Column text file encoding"), if I'm not wrong

Not sure if returning "ANSI" is correct, I chose to keep it at "?" if there is no match, since you can't be sure if it's ASCII or ANSI with codepaging.
Not an expert on file encoding though, took me some years to finally understand some of the basics. o)

cya,
tbone[/quote]
Hair is convenient for everyone, you are inspired me very happy.

You say "Hair"? Huhm.. o)

translation error