Rem ----------------------------------------------------------------------------------------- Rem - WARNING Rem - CRAWLING A SITE CAN PUT A STRAIN ON THE HOST SERVER AND MAY BE PERCEIVED AS A Rem - DENIAL-OF-SERVICE ATTACK. YOU SHOULD ONLY CRAWL SITES FOR WHICH YOU HAVE BEEN GRANTED Rem - PERMISSION BY THE OWNER OF THE SITE. AS A COURTESY YOU SHOULD (1) IDENTIFY YOURSELF IN Rem - THE "USER-AGENT" AND/OR "FROM" HTTP HEADER, (2) INCLUDE A "REFERER" HEADER (3) TRY TO Rem - PRESERVE SESSION STATE THROUGH COOKIES SO THAT NEW SESSIONS ARE NOT CREATED FOR EACH Rem - PAGE REQUEST, (4) INCLUDE A PAUSE IN THE CRAWL TO SLOW THINGS DOWN, AND (5) USE A LOW Rem - NUMBER OF THREADS. Rem - Rem - THIS SCRIPT IS PROVIDED AS-IS WITHOUT ANY WARRANTIES AND IS STRICTLY FOR DEMONSTRATING Rem - SCRIPTQ FEATURES. Rem ----------------------------------------------------------------------------------------- Rem - Crawl web site for broken links. Each script processes a single URL and creates a job Rem - for each link on the page. Log file will contain broken link URL, referrer URL and link Rem - text. Rem - Rem - Arg1 contains the URL to crawl. Rem - Arg2 contains the referrer URL. Rem - Arg3 contains the link text. Rem ----------------------------------------------------------------------------------------- Rem ----------------------------------------------------------------------------------------- Rem - Declare Rem ----------------------------------------------------------------------------------------- Dim objHTTP, objDoc, objNode, objCookies, objCookie, strURL, strBaseURL, strNewURL, strReferrer, _ strLinkText, strCookie, strCookieXML, varExclude, varItem, _ boolExclude, strTemp, i Rem ----------------------------------------------------------------------------------------- Rem - **** START - CHANGE ME **** Rem ----------------------------------------------------------------------------------------- Const strDomain = "localhost" 'Just the domain to crawl without a protocol or path. Const strPath = "/" 'The path appended to the domain. Const strUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1) ScriptQ Broken Link Checker" Const strYourEmailAddress = "" 'Your email address can be used to contact you in case your crawl is causing problems. 'List URLs to exclude. If the following is part of a URL then the URL will be excluded. 'Customize this list to suit your own site. varExclude = Array("validator.w3.org", "printer-friendly.asp", "xml.asp", "css=yes", "css=no", "jump=") Rem ----------------------------------------------------------------------------------------- Rem - **** END - CHANGE ME **** Rem ----------------------------------------------------------------------------------------- Rem ----------------------------------------------------------------------------------------- Rem - Check URL to crawl exists. Rem ----------------------------------------------------------------------------------------- If UBound(ScriptQ.JobArguments) <> 2 Then strURL = "http://" & strDomain & strPath ScriptQ.SharedRemoveAll Else strURL = ScriptQ.JobArguments()(0) strReferrer = ScriptQ.JobArguments()(1) strLinkText = ScriptQ.JobArguments()(2) End If Rem ----------------------------------------------------------------------------------------- Rem - Check URL is for HTTP Rem ----------------------------------------------------------------------------------------- If Left(LCase(strURL), 7) <> "http://" Then ScriptQ.Echo "Skipping URL: " & strURL ScriptQ.Quit Else If Len(strURL) <= 7 Then ScriptQ.Echo "Skipping URL: " & strURL ScriptQ.Quit End If End If Rem ----------------------------------------------------------------------------------------- Rem - Check URL fragment Rem ----------------------------------------------------------------------------------------- i = InStr(1, strURL, "#") If i > 0 Then strURL = Left(strURL, i - 1) End If i = InStr(1, strReferrer, "#") If i > 0 Then strReferrer = Left(strReferrer, i - 1) End If Rem ----------------------------------------------------------------------------------------- Rem - Add path to domain Rem ----------------------------------------------------------------------------------------- If InStrRev(strURL, "/") = 7 Then strURL = strURL & "/" End If Rem ----------------------------------------------------------------------------------------- Rem - Check URL has not been already crawled. Rem ----------------------------------------------------------------------------------------- If ScriptQ.SharedExists(LCase(strURL)) Then If ScriptQ.SharedItem(LCase(strURL)) <> 200 Then ScriptQ.Echo ScriptQ.SharedItem(LCase(strURL)) & vbTab & strURL & vbTab & strReferrer & vbTab & strLinkText End If ScriptQ.Quit End If Rem ----------------------------------------------------------------------------------------- Rem - Initialize Rem ----------------------------------------------------------------------------------------- Set objHTTP = CreateObject("XStandard.HTTP") objHTTP.AbsoluteTimeout = True objHTTP.MaxRedirects = 3 If ScriptQ.JobTimeout >= 10000 Then objHTTP.TimeOut = ScriptQ.JobTimeout - 2000 Else objHTTP.Timeout = 30000 End If objHTTP.AddRequestHeader "Accept", "text/xml,application/xml,application/xhtml+xml,text/html,*/*" objHTTP.AddRequestHeader "Accept-Language", "*" objHTTP.AddRequestHeader "User-Agent", strUserAgent If Len(strYourEmailAddress) > 0 Then objHTTP.AddRequestHeader "From", strYourEmailAddress End If If Len(strReferrer) > 0 Then objHTTP.AddRequestHeader "Referer", strReferrer End If Set objDoc = CreateObject("MSXML2.DOMDocument.4.0") objDoc.Async = False Set objCookies = CreateObject("MSXML2.DOMDocument.4.0") objCookies.Async = False Rem ----------------------------------------------------------------------------------------- Rem - Pause a bit to slow things down. You may need to increase this value. Don't forget to Rem - increase the timeout of the job and HTTP request too. Rem ----------------------------------------------------------------------------------------- ScriptQ.Sleep 1000 Rem ----------------------------------------------------------------------------------------- Rem - Add cookies to request Rem ----------------------------------------------------------------------------------------- If objCookies.LoadXML("" & ScriptQ.SharedItem("COOKIE") & "") Then For Each objCookie In objCookies.SelectNodes("/state/cookie") Rem ----------------------------------------------------------------------------------------- Rem - Check domain Rem ----------------------------------------------------------------------------------------- If InStrRev(strDomain, objCookie.SelectSingleNode("domain").Text) > 0 Then Rem ----------------------------------------------------------------------------------------- Rem - Check path Rem ----------------------------------------------------------------------------------------- If InStr(1, LCase(strURL), "http://" & strDomain & objCookie.SelectSingleNode("path").Text) = 1 Then If Len(strCookie) = 0 Then strCookie = objCookie.SelectSingleNode("name").Text & "=" & objCookie.SelectSingleNode("value").Text Else If InStr(1, strCookie, objCookie.SelectSingleNode("name").Text & "=") = 0 Then strCookie = strCookie & ";" & objCookie.SelectSingleNode("name").Text & "=" & objCookie.SelectSingleNode("value").Text End If End If End If End If Next End If If Len(strCookie) > 0 Then objHTTP.AddRequestHeader "Cookie", strCookie End If Rem ----------------------------------------------------------------------------------------- Rem - Get page Rem ----------------------------------------------------------------------------------------- objHTTP.Get strURL Rem ----------------------------------------------------------------------------------------- Rem - Add cookies to shared memory for the current domain only Rem ----------------------------------------------------------------------------------------- strTemp = LCase(Mid(objHTTP.URL, 8)) i = InStr(1, strTemp, "/") If i > 1 Then strTemp = Mid(strTemp, 1, i - 1) End If If strDomain = strTemp Then If objCookies.LoadXML(objHTTP.ResponseCookiesAsXML) Then If objCookies.SelectNodes("/state/cookie").Length > 0 Then ScriptQ.SharedLock For Each objCookie In objCookies.SelectNodes("/state/cookie") If InStr(1, ScriptQ.SharedItem("COOKIE"), objCookie.SelectSingleNode("name").XML) = 0 Then ScriptQ.SharedItem("COOKIE") = ScriptQ.SharedItem("COOKIE") & objCookie.XML End If Next ScriptQ.SharedUnlock End If End If End If Rem ----------------------------------------------------------------------------------------- Rem - Mark as processed. Capture the original URL and URL changed by any redirects. Rem ----------------------------------------------------------------------------------------- ScriptQ.SharedAdd LCase(strURL), objHTTP.ResponseCode ScriptQ.SharedAdd LCase(objHTTP.URL), objHTTP.ResponseCode Rem ----------------------------------------------------------------------------------------- Rem - Check response code Rem ----------------------------------------------------------------------------------------- If objHTTP.ResponseCode = 200 Then ScriptQ.Echo objHTTP.ResponseCode & vbTab & strURL & vbTab & vbTab Else ScriptQ.Echo objHTTP.ResponseCode & vbTab & strURL & vbTab & strReferrer & vbTab & strLinkText Set objHTTP = Nothing Set objDoc = Nothing Set objNode = Nothing Set objCookies = Nothing Set objCookie = Nothing ScriptQ.Quit End If Rem ----------------------------------------------------------------------------------------- Rem - Load the cleaned version of the HTML into an XML parser. Rem ----------------------------------------------------------------------------------------- If Not objDoc.LoadXML(objHTTP.ResponseAsXML) Then ScriptQ.Echo "Cannot process data from: " & strURL Set objHTTP = Nothing Set objDoc = Nothing Set objNode = Nothing Set objCookies = Nothing Set objCookie = Nothing ScriptQ.Quit End If Rem ----------------------------------------------------------------------------------------- Rem - Get the base URL Rem ----------------------------------------------------------------------------------------- Set objNode = objDoc.SelectSingleNode("/html/head/base[string(@href) != '']") If objNode Is Nothing Then strBaseURL = objHTTP.URL Else strBaseURL = objNode.Attributes.GetNamedItem("href").Text End If Rem ----------------------------------------------------------------------------------------- Rem - Process page links only if this URL is part of the domain to crawl. Rem ----------------------------------------------------------------------------------------- If InStr(1, objHTTP.URL, "http://" & strDomain, vbTextCompare) <> 1 Then Set objHTTP = Nothing Set objDoc = Nothing Set objNode = Nothing Set objCookies = Nothing Set objCookie = Nothing ScriptQ.Quit Else For Each objNode In objDoc.SelectNodes("//a[string(@href) != '']") boolExclude = False Rem ----------------------------------------------------------------------------------------- Rem - Get link text Rem ----------------------------------------------------------------------------------------- If Len(objNode.Text) = 0 Then If objNode.SelectNodes(".//img[string(@alt) != '']").Length > 0 Then strTemp = "[image: " & objNode.SelectSingleNode(".//img").Attributes.GetNamedItem("alt").Text & "]" ElseIf objNode.SelectNodes(".//img[string(@title) != '']").Length > 0 Then strTemp = "[image: " & objNode.SelectSingleNode(".//img").Attributes.GetNamedItem("title").Text & "]" ElseIf objNode.SelectNodes(".//img").Length > 0 Then strTemp = "[image]" Else strTemp = "[n/a]" End If Else strTemp = objNode.Text End If strNewURL = Trim(objHTTP.ResolveRelativeURL(strBaseURL, objNode.Attributes.GetNamedItem("href").Text)) Rem ----------------------------------------------------------------------------------------- Rem - Check URL fragment Rem ----------------------------------------------------------------------------------------- i = InStr(1, strNewURL, "#") If i > 0 Then strNewURL = Left(strNewURL, i - 1) End If Rem ----------------------------------------------------------------------------------------- Rem - Check URL for exclusion Rem ----------------------------------------------------------------------------------------- If Not boolExclude Then For Each varItem In varExclude If InStr(LCase(strNewURL), varItem) > 0 Then boolExclude = True Exit For End If Next End If Rem ----------------------------------------------------------------------------------------- Rem - Check for protocol Rem ----------------------------------------------------------------------------------------- If Not boolExclude Then If Left(LCase(strNewURL), 7) <> "http://" Then ScriptQ.Echo "Skipping URL: " & strNewURL boolExclude = True Else If Len(strNewURL) <= 7 Then ScriptQ.Echo "Skipping URL: " & strNewURL boolExclude = True End If End If End If Rem ----------------------------------------------------------------------------------------- Rem - Check URL has not been already crawled. Rem ----------------------------------------------------------------------------------------- If Not boolExclude Then If ScriptQ.SharedExists(LCase(strNewURL)) Then If ScriptQ.SharedItem(LCase(strNewURL)) <> 200 Then ScriptQ.Echo ScriptQ.SharedItem(LCase(strNewURL)) & vbTab & strNewURL & vbTab & strURL & vbTab & strTemp End If Else ScriptQ.Add ScriptQ.JobScript, True, ScriptQ.JobLanguage, strNewURL, ScriptQ.JobPriority, ScriptQ.JobGroupID, ScriptQ.JobRetryDelay, ScriptQ.JobTimeout, ScriptQ.JobRetryCount, , Array(strNewURL, strURL, strTemp) End If End If Next End If Rem ----------------------------------------------------------------------------------------- Rem - Clean up Rem ----------------------------------------------------------------------------------------- Set objHTTP = Nothing Set objDoc = Nothing Set objNode = Nothing Set objCookies = Nothing Set objCookie = Nothing