Another SO webscraping question, this time a bounty. The given web page is seriously unfriendly and won't be scripted giving Nulls and crashing. In the end I resorted to writing the logic as a JavaScript program running on the ScriptEngine.
Option Explicit
'*Tools->References
'* Microsoft Scripting Runtime
'* Microsoft Scripting Control
'* Microsoft Internet Controls
'* Microsoft HTML Object Library
Sub Torrent_Data()
Dim row As Long
Dim IE As New InternetExplorer, html As HTMLDocument
Dim post As Object
With IE
.Visible = True
.navigate "https://yts.am/browse-movies"
Do While .readyState <> READYSTATE_COMPLETE:
DoEvents
Loop
Set html = .document
End With
Dim dicFilms As Scripting.Dictionary
Set dicFilms = New Scripting.Dictionary
Call GetScriptEngine.Run("getMovies", html, dicFilms)
Dim vFilms As Variant
vFilms = dicFilms.Keys
Dim vYears As Variant
vYears = dicFilms.Items
Dim lRowLoop As Long
For lRowLoop = 0 To dicFilms.Count - 1
Cells(lRowLoop + 1, 1) = vFilms(lRowLoop)
Cells(lRowLoop + 1, 2) = vYears(lRowLoop)
Next lRowLoop
Stop
IE.Quit
End Sub
Private Function GetScriptEngine() As ScriptControl
'* see code from this SO Q & A
' https://stackoverflow.com/questions/37711073/in-excel-vba-on-windows-how-to-get-stringified-json-respresentation-instead-of
Static soScriptEngine As ScriptControl
If soScriptEngine Is Nothing Then
Set soScriptEngine = New ScriptControl
soScriptEngine.Language = "JScript"
soScriptEngine.AddCode "function getMovies(htmlDocument, microsoftDict) { " & _
"var titles = htmlDocument.querySelectorAll('a.browse-movie-title'), i;" & _
"var years = htmlDocument.querySelectorAll('div.browse-movie-year'), j;" & _
"if ( years.length === years.length) {" & _
"for (i=0; i< years.length; ++i) {" & _
" var film = titles[i].innerText;" & _
" var year = years[i].innerText;" & _
" microsoftDict.Add(film, year);" & _
"}}}"
End If
Set GetScriptEngine = soScriptEngine
End Function
No comments:
Post a Comment