HTTP crawler in Erlang

I'm working on a simple HTTP crawler, but I have an issue running the code below: I request 50 URLs but get the content of only 20+ of them back. To test the crawler I generated a few files of 150 kB each. Are the 20+ responses limited by bandwidth? More importantly: how do I tell the Erlang snippet not to quit until the last file has been fetched? The test data server is online, so please try the code out; any hints are welcome :)

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/1]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  lists:foreach(fun do_spawn/1, Ids).

do_spawn(Id) ->
  proc_lib:spawn_link(?MODULE, do_send_req, [Id]).

do_send_req(Id) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]);
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err])
  end.

That's the output:

Requesting ID 1 ... 
Requesting ID 2 ... 
Requesting ID 3 ... 
Requesting ID 4 ... 
Requesting ID 5 ... 
Requesting ID 6 ... 
Requesting ID 7 ... 
Requesting ID 8 ... 
Requesting ID 9 ... 
Requesting ID 10 ... 
Requesting ID 11 ... 
Requesting ID 12 ... 
Requesting ID 13 ... 
Requesting ID 14 ... 
Requesting ID 15 ... 
Requesting ID 16 ... 
Requesting ID 17 ... 
Requesting ID 18 ... 
Requesting ID 19 ... 
Requesting ID 20 ... 
Requesting ID 21 ... 
Requesting ID 22 ... 
Requesting ID 23 ... 
Requesting ID 24 ... 
Requesting ID 25 ... 
Requesting ID 26 ... 
Requesting ID 27 ... 
Requesting ID 28 ... 
Requesting ID 29 ... 
Requesting ID 30 ... 
Requesting ID 31 ... 
Requesting ID 32 ... 
Requesting ID 33 ... 
Requesting ID 34 ... 
Requesting ID 35 ... 
Requesting ID 36 ... 
Requesting ID 37 ... 
Requesting ID 38 ... 
Requesting ID 39 ... 
Requesting ID 40 ... 
Requesting ID 41 ... 
Requesting ID 42 ... 
Requesting ID 43 ... 
Requesting ID 44 ... 
Requesting ID 45 ... 
Requesting ID 46 ... 
Requesting ID 47 ... 
Requesting ID 48 ... 
Requesting ID 49 ... 
Requesting ID 50 ... 
OK -- ID: 49 -- Status: "200" -- Content length: 150000
OK -- ID: 47 -- Status: "200" -- Content length: 150000
OK -- ID: 50 -- Status: "200" -- Content length: 150000
OK -- ID: 17 -- Status: "200" -- Content length: 150000
OK -- ID: 48 -- Status: "200" -- Content length: 150000
OK -- ID: 45 -- Status: "200" -- Content length: 150000
OK -- ID: 46 -- Status: "200" -- Content length: 150000
OK -- ID: 10 -- Status: "200" -- Content length: 150000
OK -- ID: 09 -- Status: "200" -- Content length: 150000
OK -- ID: 19 -- Status: "200" -- Content length: 150000
OK -- ID: 13 -- Status: "200" -- Content length: 150000
OK -- ID: 21 -- Status: "200" -- Content length: 150000
OK -- ID: 16 -- Status: "200" -- Content length: 150000
OK -- ID: 27 -- Status: "200" -- Content length: 150000
OK -- ID: 03 -- Status: "200" -- Content length: 150000
OK -- ID: 23 -- Status: "200" -- Content length: 150000
OK -- ID: 29 -- Status: "200" -- Content length: 150000
OK -- ID: 14 -- Status: "200" -- Content length: 150000
OK -- ID: 18 -- Status: "200" -- Content length: 150000
OK -- ID: 01 -- Status: "200" -- Content length: 150000
OK -- ID: 30 -- Status: "200" -- Content length: 150000
OK -- ID: 40 -- Status: "200" -- Content length: 150000
OK -- ID: 05 -- Status: "200" -- Content length: 150000
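
My current guess: start/0 only spawns send_reqs/0 and returns immediately, so nothing blocks until the workers are done (and if the node is started non-interactively, the VM may halt and kill them mid-request). A minimal sketch of the kind of blocking start I have in mind, using spawn_monitor/3 (the wait_all/1 helper is my own naming, nothing from ibrowse):

start_and_wait() ->
  ibrowse:start(),
  %% spawn one monitored worker per ID and remember the monitor references
  Refs = [element(2, spawn_monitor(?MODULE, do_send_req, [Id])) || Id <- fetch_ids()],
  wait_all(Refs).

%% block until every monitored worker has terminated
wait_all([]) ->
  ok;
wait_all(Refs) ->
  receive
    {'DOWN', Ref, process, _Pid, _Reason} ->
      wait_all(lists:delete(Ref, Refs))
  end.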

Update:

Thanks stemm for the hint with the wait_workers. I've combined your code and mine, but I get the same behaviour :(

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/2]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
    {Ref, done} ->
      done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
      %% send a message that the work is done
      Pid ! {Ref, done};
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
      %% repeat the request if there was an error while fetching the page,
      do_send_req(Id, {Pid, Ref})
      %% or - if you don't want to repeat the request, put here:
      %% Pid ! {Ref, done}
  end.
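
One thing I notice while re-reading: if a request keeps failing, this do_send_req/2 retries forever and {Ref, done} never arrives, so wait_workers/1 would hang. A capped-retry variant I'm considering (the Retries counter and the 3-attempt limit are my own additions, just a sketch):

do_send_req(Id, From) ->
  do_send_req(Id, From, 3).  %% at most 3 attempts per ID

do_send_req(Id, {Pid, Ref}, Retries) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  case catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000) of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
      Pid ! {Ref, done};
    Err when Retries > 1 ->
      io:format("ERROR -- ID: ~p -- Error: ~p -- retrying~n", [Id, Err]),
      do_send_req(Id, {Pid, Ref}, Retries - 1);
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p -- giving up~n", [Id, Err]),
      %% still report done so wait_workers/1 can finish
      Pid ! {Ref, done}
  end.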

Running the crawler works fine for a handful of files, but then it doesn't even fetch the files completely (each file is 150000 bytes): the crawler fetches some files only partially, as the following web server log shows :(

82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /10 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /1 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /3 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /8 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /39 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /7 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /6 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /2 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /5 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /50 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /9 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /44 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /38 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /47 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /49 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /43 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /37 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /46 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /48 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /36 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /42 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /41 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /45 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /17 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /35 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /16 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /15 HTTP/1.1" 200 17020 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /21 HTTP/1.1" 200 120360 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /40 HTTP/1.1" 200 117600 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /34 HTTP/1.1" 200 60660 "-" "-"

Any hints are welcome. I have no clue what's going wrong here :(
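
One more thing I still want to try: if I read the ibrowse docs right, it caps connections per host (max_sessions) and pipelined requests per connection (max_pipeline_size), both defaulting to 10, so 50 parallel requests against one host may get queued or torn down. Raising the limits per request would look roughly like this (the value 50 is an arbitrary guess):

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  %% allow up to 50 parallel connections / pipelined requests for this host
  Opts = [{max_sessions, 50}, {max_pipeline_size, 50}],
  case catch ibrowse:send_req(to_url(Id), [], get, [], Opts, 10000) of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
      Pid ! {Ref, done};
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
      Pid ! {Ref, done}
  end.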
