Asynchronous Webcrawling F#, something wrong ?
- by jlezard
Not quite sure if it is ok to do this but, my question is: Is there something wrong with my code ? It doesn't go as fast as I would like, and since I am using lots of async workflows maybe I am doing something wrong. The goal here is to build something that can crawl 20 000 pages in less than an hour.
open System
open System.Text
open System.Net
open System.IO
open System.Text.RegularExpressions
open System.Collections.Generic
open System.ComponentModel
open Microsoft.FSharp
open System.Threading
//This is the Parallel.Fs file
type ComparableUri ( uri: string ) =
inherit System.Uri( uri )
let elts (uri:System.Uri) =
uri.Scheme, uri.Host, uri.Port, uri.Segments
interface System.IComparable with
member this.CompareTo( uri2 ) =
compare (elts this) (elts(uri2 :?> ComparableUri))
override this.Equals(uri2) =
compare this (uri2 :?> ComparableUri ) = 0
override this.GetHashCode() = 0
///////////////////////////////////////////////Funtions to retreive html string//////////////////////////////
let mutable error = Set.empty<ComparableUri>
let mutable visited = Set.empty<ComparableUri>
let getHtmlPrimitiveAsyncDelay (delay:int) (uri : ComparableUri) =
async{
try
let req = (WebRequest.Create(uri)) :?> HttpWebRequest
// 'use' is equivalent to ‘using’ in C# for an IDisposable
req.UserAgent<-"Mozilla"
//Console.WriteLine("Waiting")
do! Async.Sleep(delay * 250)
let! resp = (req.AsyncGetResponse())
Console.WriteLine(uri.AbsoluteUri+" got response after delay "+string delay)
use stream = resp.GetResponseStream()
use reader = new StreamReader(stream)
let html = reader.ReadToEnd()
return html
with
| _ as ex -> Console.WriteLine( ex.ToString() )
lock error (fun () -> error<- error.Add uri )
lock visited (fun () -> visited<-visited.Add uri )
return "BadUri"
}
///////////////////////////////////////////////Active Pattern Matching to retreive href//////////////////////////////
let (|Matches|_|) (pat:string) (inp:string) =
let m = Regex.Matches(inp, pat)
// Note the List.tl, since the first group is always the entirety of the matched string.
if m.Count > 0
then Some (List.tail [ for g in m -> g.Value ])
else None
let (|Match|_|) (pat:string) (inp:string) =
let m = Regex.Match(inp, pat)
// Note the List.tl, since the first group is always the entirety of the matched string.
if m.Success then
Some (List.tail [ for g in m.Groups -> g.Value ])
else
None
///////////////////////////////////////////////Find Bad href//////////////////////////////
let isEmail (link:string) =
link.Contains("@")
let isMailto (link:string) =
if Seq.length link >=6 then
link.[0..5] = "mailto"
else
false
let isJavascript (link:string) =
if Seq.length link >=10 then
link.[0..9] = "javascript"
else
false
let isBadUri (link:string) =
link="BadUri"
let isEmptyHttp (link:string) =
link="http://"
let isFile (link:string)=
if Seq.length link >=6 then
link.[0..5] = "file:/"
else
false
let containsPipe (link:string) =
link.Contains("|")
let isAdLink (link:string) =
if Seq.length link >=6 then
link.[0..5] = "adlink"
elif Seq.length link >=9 then
link.[0..8] = "http://adLink"
else
false
///////////////////////////////////////////////Find Bad href//////////////////////////////
let getHref (htmlString:string) =
let urlPat = "href=\"([^\"]+)"
match htmlString with
| Matches urlPat urls -> urls |> List.map( fun href -> match href with
| Match (urlPat) (link::[]) -> link
| _ -> failwith "The href was not in correct format, there was more than one match" )
| _ -> Console.WriteLine( "No links for this page" );[]
|> List.filter( fun link -> not(isEmail link) )
|> List.filter( fun link -> not(isMailto link) )
|> List.filter( fun link -> not(isJavascript link) )
|> List.filter( fun link -> not(isBadUri link) )
|> List.filter( fun link -> not(isEmptyHttp link) )
|> List.filter( fun link -> not(isFile link) )
|> List.filter( fun link -> not(containsPipe link) )
|> List.filter( fun link -> not(isAdLink link) )
let treatAjax (href:System.Uri) =
let link = href.ToString()
let firstPart = (link.Split([|"#"|],System.StringSplitOptions.None)).[0]
new Uri(firstPart)
//only follow pages with certain extnsion or ones with no exensions
let followHref (href:System.Uri) =
let valid2 = set[".py"]
let valid3 = set[".php";".htm";".asp"]
let valid4 = set[".php3";".php4";".php5";".html";".aspx"]
let arrLength = href.Segments |> Array.length
let lastExtension = (href.Segments).[arrLength-1]
let lengthLastExtension = Seq.length lastExtension
if (lengthLastExtension <= 3) then
not( lastExtension.Contains(".") )
else
//test for the 2 case
let last4 = lastExtension.[(lengthLastExtension-1)-3..(lengthLastExtension-1)]
let isValid2 = valid2|>Seq.exists(fun validEnd -> last4.EndsWith( validEnd) )
if isValid2 then
true
else
if lengthLastExtension <= 4 then
not( last4.Contains(".") )
else
let last5 = lastExtension.[(lengthLastExtension-1)-4..(lengthLastExtension-1)]
let isValid3 = valid3|>Seq.exists(fun validEnd -> last5.EndsWith( validEnd) )
if isValid3 then
true
else
if lengthLastExtension <= 5 then
not( last5.Contains(".") )
else
let last6 = lastExtension.[(lengthLastExtension-1)-5..(lengthLastExtension-1)]
let isValid4 = valid4|>Seq.exists(fun validEnd -> last6.EndsWith( validEnd) )
if isValid4 then
true
else
not( last6.Contains(".") ) && not(lastExtension.[0..5] = "mailto")
//Create the correct links / -> add the homepage , make them a comparabel Uri
let hrefLinksToUri ( uri:ComparableUri ) (hrefLinks:string list) =
hrefLinks
|> List.map( fun link -> try
if Seq.length link <4 then
Some(new Uri( uri, link ))
else
if link.[0..3] = "http" then
Some(new Uri(link))
else
Some(new Uri( uri, link ))
with
| _ as ex -> Console.WriteLine(link);
lock error (fun () ->error<-error.Add uri)
None
)
|> List.filter( fun link -> link.IsSome )
|> List.map( fun o -> o.Value)
|> List.map( fun uri -> new ComparableUri( string uri ) )
//Treat uri , removing ajax last part , and only following links specified b Benoit
let linksToFollow (hrefUris:ComparableUri list) =
hrefUris
|>List.map( treatAjax )
|>List.filter( fun link -> followHref link )
|>List.map( fun uri -> new ComparableUri( string uri ) )
|>Set.ofList
let needToVisit uri =
( lock visited (fun () -> not( visited.Contains uri) ) ) && (lock error (fun () -> not( error.Contains uri) ))
let getLinksToFollowAsyncDelay (delay:int) ( uri: ComparableUri ) =
async{
let! links = getHtmlPrimitiveAsyncDelay delay uri
lock visited (fun () ->visited<-visited.Add uri)
let linksToFollow = getHref links
|> hrefLinksToUri uri
|> linksToFollow
|> Set.filter( needToVisit )
|> Set.map( fun link -> if uri.Authority=link.Authority then
link
else
link )
return linksToFollow
}
//Add delays if visitng same authority
let getDelay(uri:ComparableUri) (authorityDelay:Dictionary<string,int>) =
let uriAuthority = uri.Authority
let hasAuthority,delay = authorityDelay.TryGetValue(uriAuthority)
if hasAuthority then
authorityDelay.[uriAuthority] <-delay+1
delay
else
authorityDelay.Add(uriAuthority,1)
0
let rec getLinksToFollowFromSetAsync maxIteration ( uris: seq<ComparableUri> ) =
let authorityDelay = Dictionary<string,int>()
if maxIteration = 100 then
Console.WriteLine("Finished")
else
//Unite by authority add delay for those we same authority others ignore
let stopwatch= System.Diagnostics.Stopwatch()
stopwatch.Start()
let newLinks = uris
|> Seq.map( fun uri -> let delay = lock authorityDelay (fun () -> getDelay uri authorityDelay )
getLinksToFollowAsyncDelay delay uri )
|> Async.Parallel
|> Async.RunSynchronously
|> Seq.concat
stopwatch.Stop()
Console.WriteLine("\n\n\n\n\n\n\nTimeElapse : "+string stopwatch.Elapsed+"\n\n\n\n\n\n\n\n\n")
getLinksToFollowFromSetAsync (maxIteration+1) newLinks
getLinksToFollowFromSetAsync 0 (seq[ComparableUri( "http://twitter.com/" )])
Console.WriteLine("Finished")
Some feedBack would be great ! Thank you (note this is just something I am doing for fun)