headerbanner

VBA - Excel - čtení vzdáleného souboru

Reading remote files in VBA with different encoding best to manage Enternet.Explorer object. The usual reading through MSXML2.XMLHTTP object can not cope with the national coding. If you can not use the (slow) IE object, you must first load remote file and save it. It is then possible to use ADODB.stream.

Test různých metod čtení vzdáleného obsahu a jak si VBA poradí s národními sadami znaků a různým kódováním.

Obvykle doporučované čtení pomocí MSXML2.XMLHTTP objektu si neporadí s windows-1250 ani s iso-8859-2. Nejspolehlivější je použití objektu Internet Exploreru, který využívá spolehlivé vlastní zjištění kódu. Vyžaduje to ale spuštění aplikace - byť neviditelné. Rychlejší může být download souboru na disk s následným čtením pomocí ADODB.Stream.

ADO knihovna nemá vlastní detekci kódu, nebo přinejmenším ve VBA nefunguje parametr _autodetect. Proto je třeba přečíst kódování z dokumentu, což je možné u XML a HTML dokumentů. U ostatních je nutné znát jejich kódování a poskytnout ho ADO objektu. Následující okomentovaný testovací kód vše objasňuje:

'http://www.progtown.com/topic1296585-vba-how-to-download-a-file-from-the-internet.html
Private Declare Function URLDownloadToFile Lib "urlmon" _
Alias "URLDownloadToFileA" (ByVal pCaller As Long, _
ByVal szURL As String, ByVal szFileName As String, _
ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
 
Const url = "http://www.newsroom.cz" 'URL of test server
 
'do you have set References (in Tools menu) for MSHTML.HTMLDocument and ADO?
 
 
Sub testIt()
'***********************
'Read remote content by InternetExplorer.Application object
'***********************
 
'read page charset windows-1250
MsgBox GetRemoteByIE(url & "/forvba-win1250.html", "cont") 'OK
 
'read page charset utf-8
MsgBox GetRemoteByIE(url & "/forvba-utf-8.html", "cont") 'OK
 
'read page charset iso-8859-2
MsgBox GetRemoteByIE(url & "/forvba-iso8859-2.html", "cont") 'OK
 
 
'WOW! It is the only reliable method without putting character encoding
'because it may use the encoding detection algorithms implemented in IE
 
GetRemoteByIE "", "", True 'close static IE object
 
'***********************
'Read remote content by MSXML2 object
'***********************
 
'read page charset windows-1250
MsgBox GetRemoteByMSXML2(url & "/forvba-win1250.html", "cont") 'bad
 
'read page charset utf-8
MsgBox GetRemoteByMSXML2(url & "/forvba-utf-8.html", "cont") 'OK
 
'read page charset iso-8859-2
MsgBox GetRemoteByMSXML2(url & "/forvba-iso8859-2.html", "cont") 'bad
 
 
 
'***********************
'Read remote content by Win API function URLDownloadToFileA
'and native Open file  (or FileSystemObject - read a comment in GetRemoteByDownload())
'***********************
 
'read page charset windows-1250
MsgBox GetRemoteByDownloadNoADO(url & "/forvba-win1250.html", "cont", "D:\f.txt")  'OK
 
'read page charset utf-8
MsgBox GetRemoteByDownloadNoADO(url & "/forvba-utf-8.html", "cont", "D:\f.txt") 'bad
 
'read page charset iso-8859-2
MsgBox GetRemoteByDownloadNoADO(url & "/forvba-iso8859-2.html", "cont", "D:\f.txt")  'bad
 
 
 
'***********************
'Read remote content by Win API function URLDownloadToFileA and ADODB.Stream with known encoding
'***********************
 
'read page charset windows-1250
MsgBox GetRemoteByDownloadADO(url & "/forvba-win1250.html", "cont", "D:\f.txt", "windows-1250") 'OK
 
'read page charset utf-8
MsgBox GetRemoteByDownloadADO(url & "/forvba-utf-8.html", "cont", "D:\f.txt", "utf-8") 'OK
 
'read page charset iso-8859-2
MsgBox GetRemoteByDownloadADO(url & "/forvba-iso8859-2.html", "cont", "D:\f.txt", "iso-8859-2") 'OK
 
 
 
'***********************
'Read remote content by Win API function URLDownloadToFileA and ADODB.Stream without known encoding
'***********************
 
'read page charset windows-1250
MsgBox GetRemoteByDownloadADO(url & "/forvba-win1250.html", "cont", "D:\f.txt", "autodetect") 'OK
 
'read page charset utf-8
MsgBox GetRemoteByDownloadADO(url & "/forvba-utf-8.html", "cont", "D:\f.txt", "autodetect") 'OK
 
'read page charset iso-8859-2
MsgBox GetRemoteByDownloadADO(url & "/forvba-iso8859-2.html", "cont", "D:\f.txt", "autodetect") 'OK
 
End Sub
 
 
 
 
 
Public Function GetRemoteByIE(url, id, Optional ByVal quit As Boolean = False) As String
'It is the only reliable reading method without putting character encoding
Static IE As Object
    If Not IE Is Nothing And quit Then
        IE.quit
        Set IE = Nothing
        Exit Function
    End If
    If IE Is Nothing Then Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = False
    tim = Timer 'for lazy response of remote server
    IE.navigate url
    Do While IE.Busy And Timer - tim < 5: DoEvents: Loop
    Do While IE.readyState <> 4 And Timer - tim < 5: DoEvents: Loop
    GetRemoteByIE = IE.document.getElementById(id).innerHTML 'you need not MSHTML.HTMLDocument object
 
End Function
 
Public Function GetRemoteByMSXML2(url, id) As String
'It is fast and it may to read asynchronously, but it incorrect for iso and windows character set
    Set xmlHttp = CreateObject("MSXML2.xmlHttp")
 
    'Next False parameter is for synchronous reading,
    'for asynchronous reading see http://dailydoseofexcel.com/archives/2006/10/09/async-xmlhttp-calls/
    xmlHttp.Open "GET", url, False
    xmlHttp.send
    s = Timer
    Do While xmlHttp.readyState <> 4 And Timer - s < 5: DoEvents: Loop
    If xmlHttp.Status = 200 Then
        Set doc = New MSHTML.HTMLDocument
        doc.body.innerHTML = xmlHttp.responseText
        GetRemoteByMSXML2 = doc.getElementById(id).innerHTML
    Else
        GetRemoteByMSXML2 = "No page url: " & url
    End If
 
    Set xmlHttp = Nothing
    Set doc = Nothing
End Function
 
Public Function GetRemoteByDownloadNoADO(url, id, tofile, Optional charset As Variant) As String
'do not use this reading method
    URLDownloadToFile 0, url, tofile, 0, 0
 
    Set doc = New MSHTML.HTMLDocument
 
 
'    'Only for test
'        'the first reading method with FileSystemObject /bad for utf-8 and iso
'        Dim FSO As FileSystemObject
'        Dim ts As TextStream
'        Set FSO = New FileSystemObject
'        Set ts = FSO.OpenTextFile(tofile,1, 0, False) 'false=not unicode
'        htmlstr = ts.ReadAll
 
    'the second reading method with native Open /bad for utf-8 and iso
    htmlstr = vbNullString
    Open tofile For Input As #1
    Do Until EOF(1)
    Line Input #1, textline
    htmlstr = htmlstr & textline
    Loop
    Close
    doc.body.innerHTML = htmlstr
    GetRemoteByDownloadNoADO = doc.getElementById(id).innerHTML
 
End Function
Public Function GetRemoteByDownloadADO(url, id, tofile, charset) As String
'use this reading method if you can't use reading by Internet Explorer object
    URLDownloadToFile 0, url, tofile, 0, 0
    Set doc = New MSHTML.HTMLDocument
    If charset = "autodetect" Then
        'reading method with ADODB.Stream without charset parameter
        'reliable method for HTML and XML documents with specifying the encoding
 
        charset = "utf-8" 'first reading assumes utf encoding
        htmlstr = vbNullString
        Set objStream = CreateObject("ADODB.Stream")
        objStream.charset = charset
        objStream.Open
        objStream.LoadFromFile (tofile)
        htmlstr = objStream.ReadText()
        'search charset meta tag in file first, then set charset value
        If InStr(1, htmlstr, "utf-8", vbTextCompare) = 0 Then 'if not utf-8
            'start special locale
            If InStr(1, htmlstr, "windows-1250", vbTextCompare) = 0 Then
                If InStr(1, htmlstr, "iso-8859-2", vbTextCompare) = 0 Then
                    MsgBox "I can't read known encoding"
                    Exit Function
                Else
                    charset = "iso-8859-2"
                End If
            Else
                charset = "windows-1250"
            End If
            'end special locale
            objStream.Close
            objStream.charset = charset
            objStream.Open 'second reading with right encoding parameter
            objStream.LoadFromFile (tofile)
            htmlstr = objStream.ReadText()
        End If
    Else
        'reading method with ADODB.Stream and submitted charset parameter
        'It is the reliable method for all remote files (csv, txt... without meta tag with encoding parameter)
        htmlstr = vbNullString
        Set objStream = CreateObject("ADODB.Stream")
        objStream.charset = charset
        objStream.Open
        objStream.LoadFromFile (tofile)
        htmlstr = objStream.ReadText()
    End If
    objStream.Close
    doc.body.innerHTML = htmlstr
    GetRemoteByDownloadADO = doc.getElementById(id).innerHTML
End Function
'Code of page for remote reading test:
'<html>
'  <head>
'  <meta http-equiv="content-type" content="text/html; charset=utf-8">
'  </head>
'  <body>
'    <div id="cont">utf-8:ìšèøžýáíéòïùú</div>
'  </body>
'</html>