We're experiencing a strange problem with our current Varnish configuration.
4x web servers (IIS 6.5 on Windows Server 2003, each on an Intel(R) Xeon(R) CPU E5450 @ 3.00GHz quad core, 4GB RAM)
3x Varnish servers (varnish-3.0.3 revision 9e6a70f on Ubuntu 12.04.2 LTS 64-bit/precise, kernel Linux 3.2.0-29-generic, each on an Intel(R) Xeon(R) CPU E5450 @ 3.00GHz quad core, 4GB RAM)
The Varnish servers' performance is awful in general, to the point that if we shut one of them down, the remaining two cannot fulfill all the requests and start to skip beats, resulting in pending requests, timeouts, 404s, etc.
What can we do to improve our Varnish performance? Considering that we peak at fewer than 5k requests per second, we should be able to serve our pages without any problem even with a single one of them.
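If raw counters would help, this is how we can grab a snapshot on each Varnish box (a minimal example; the counter names are the ones varnishstat -1 prints on 3.0.x, and the grep list is just the subset that seemed relevant to queuing and evictions):
# One-shot dump of the shared-memory counters, filtered down to hits/misses,
# dropped client connections, queued/dropped work requests and LRU nuking.
varnishstat -1 | grep -E 'cache_hit|cache_miss|client_drop|n_wrk_queued|n_wrk_drop|n_lru_nuked'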
We use a standard, vanilla configuration, as shown by this varnishadm param.show output:
acceptor_sleep_decay 0.900000 []
acceptor_sleep_incr 0.001000 [s]
acceptor_sleep_max 0.050000 [s]
auto_restart on [bool]
ban_dups on [bool]
ban_lurker_sleep 0.010000 [s]
between_bytes_timeout 60.000000 [s]
cc_command "exec gcc -std=gnu99 -g -O2 -pthread -fpic -shared -Wl,-x -o %o %s"
cli_buffer 8192 [bytes]
cli_timeout 20 [seconds]
clock_skew 10 [s]
connect_timeout 0.700000 [s]
critbit_cooloff 180.000000 [s]
default_grace 10.000000 [seconds]
default_keep 0.000000 [seconds]
default_ttl 120.000000 [seconds]
diag_bitmap 0x0 [bitmap]
esi_syntax 0 [bitmap]
expiry_sleep 1.000000 [seconds]
fetch_chunksize 128 [kilobytes]
fetch_maxchunksize 262144 [kilobytes]
first_byte_timeout 60.000000 [s]
group varnish (113)
gzip_level 6 []
gzip_memlevel 8 []
gzip_stack_buffer 32768 [Bytes]
gzip_tmp_space 0 []
gzip_window 15 []
http_gzip_support off [bool]
http_max_hdr 64 [header lines]
http_range_support on [bool]
http_req_hdr_len 8192 [bytes]
http_req_size 32768 [bytes]
http_resp_hdr_len 8192 [bytes]
http_resp_size 32768 [bytes]
idle_send_timeout 60 [seconds]
listen_address :80
listen_depth 1024 [connections]
log_hashstring on [bool]
log_local_address off [bool]
lru_interval 2 [seconds]
max_esi_depth 5 [levels]
max_restarts 4 [restarts]
nuke_limit 50 [allocations]
pcre_match_limit 10000 []
pcre_match_limit_recursion 10000 []
ping_interval 3 [seconds]
pipe_timeout 60 [seconds]
prefer_ipv6 off [bool]
queue_max 100 [%]
rush_exponent 3 [requests per request]
saintmode_threshold 10 [objects]
send_timeout 600 [seconds]
sess_timeout 5 [seconds]
sess_workspace 16384 [bytes]
session_linger 50 [ms]
session_max 100000 [sessions]
shm_reclen 255 [bytes]
shm_workspace 8192 [bytes]
shortlived 10.000000 [s]
syslog_cli_traffic on [bool]
thread_pool_add_delay 2 [milliseconds]
thread_pool_add_threshold 2 [requests]
thread_pool_fail_delay 200 [milliseconds]
thread_pool_max 2000 [threads]
thread_pool_min 5 [threads]
thread_pool_purge_delay 1000 [milliseconds]
thread_pool_stack unlimited [bytes]
thread_pool_timeout 300 [seconds]
thread_pool_workspace 65536 [bytes]
thread_pools 2 [pools]
thread_stats_rate 10 [requests]
user varnish (106)
vcc_err_unref on [bool]
vcl_dir /etc/varnish
vcl_trace off [bool]
vmod_dir /usr/lib/varnish/vmods
waiter default (epoll, poll)
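As the output shows, the threading parameters are still at their defaults (thread_pools 2, thread_pool_min 5, thread_pool_max 2000, thread_pool_add_delay 2). One thing we were unsure about is whether we should simply raise the minimum thread count; the sketch below is what we had in mind, reusing the parameter names from the output above (the values are guesses we have not validated):
# Pre-create more worker threads per pool instead of letting Varnish
# add them on demand a couple of milliseconds at a time under load.
varnishadm param.set thread_pools 4
varnishadm param.set thread_pool_min 100
varnishadm param.set thread_pool_add_delay 1
To make such a change survive a restart it would have to go into the startup options instead (e.g. -p thread_pool_min=100 in /etc/default/varnish on Ubuntu).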
This is our default.vcl file: LINK
sub vcl_recv {
# BASIC recv COMMANDS:
#
# lookup -> search the item in the cache
# pass -> always serve a fresh item (no-caching)
# pipe -> like pass but ensures a direct-connection with the backend (no-cache AND no-proxy)
# Allow Varnish to serve stale content if the backend is responding slowly.
# This defines for how long past its TTL Varnish may use a stale object if it has one in the cache.
set req.grace = 30s;
if (client.ip == "127.0.0.1") {
# request from NGINX - do not alter X-Forwarded-For
set req.http.HTTPS = "on";
}
else {
# Add an X-Forwarded-For to keep track of original request
unset req.http.HTTPS;
unset req.http.X-Forwarded-For;
set req.http.X-Forwarded-For = client.ip;
}
set req.backend = www_director;
# Strip all cookies to force an anonymous request when the back-end servers are down.
if (!req.backend.healthy) {
unset req.http.Cookie;
}
## HTTP Accept-Encoding normalization
if (req.http.Accept-Encoding) {
if (req.http.Accept-Encoding ~ "gzip") {
set req.http.Accept-Encoding = "gzip";
}
else if (req.http.Accept-Encoding ~ "deflate") {
set req.http.Accept-Encoding = "deflate";
}
else {
unset req.http.Accept-Encoding;
}
}
if (req.request != "GET" &&
req.request != "HEAD" &&
req.request != "PUT" &&
req.request != "POST" &&
req.request != "TRACE" &&
req.request != "OPTIONS" &&
req.request != "DELETE") {
/* non-RFC2616 or CONNECT */
return (pipe);
}
if (req.request != "GET" && req.request != "HEAD") {
/* only deal with GET and HEAD by default */
return (pass);
}
if (req.http.Authorization) {
return (pass);
}
if (req.http.HTTPS ~ "on") {
return (pass);
}
######################################################
# COOKIE HANDLING
######################################################
# METHOD 1: do not remove cookies, but pass the page if they contain TB_NC
if (!(req.url ~ "(?i)\.(png|gif|jpeg|jpg|ico|swf|css|js)(\?[a-z0-9]+)?$")) {
if (req.http.Cookie && req.http.Cookie ~ "TB_NC") {
return (pass);
}
}
return (lookup);
}
# Code determining what to do with items fetched from the IIS servers
sub vcl_fetch {
unset beresp.http.Server;
set beresp.http.Server = "Server-1";
# Allow items to be served stale if needed. This is how long past its TTL Varnish may keep an object around.
set beresp.grace = 1h;
if (req.url ~ "(?i)\.(png|gif|jpeg|jpg|ico|swf|css|js)(\?[a-z0-9]+)?$") {
unset beresp.http.set-cookie;
}
# Default Varnish VCL logic
if (beresp.ttl <= 0s ||
beresp.http.Set-Cookie ||
beresp.http.Vary == "*") {
set beresp.ttl = 120 s;
return(hit_for_pass);
}
# Not Cacheable if it has specific TB_NC no-caching cookie
if (req.http.Cookie && req.http.Cookie ~ "TB_NC") {
set beresp.http.X-Cacheable = "NO:Got Cookie";
set beresp.ttl = 120 s;
return(hit_for_pass);
}
# Not Cacheable if it has Cache-Control private
else if (beresp.http.Cache-Control ~ "private") {
set beresp.http.X-Cacheable = "NO:Cache-Control=private";
set beresp.ttl = 120 s;
return(hit_for_pass);
}
# Not Cacheable if it has Cache-Control no-cache or Pragma no-cache
else if (beresp.http.Cache-Control ~ "no-cache" || beresp.http.Pragma ~ "no-cache") {
set beresp.http.X-Cacheable = "NO:Cache-Control=no-cache (or pragma no-cache)";
set beresp.ttl = 120 s;
return(hit_for_pass);
}
# If we reach this point, the object is cacheable.
# Cacheable but without enough TTL: we extend the object's lifetime artificially.
# NOTE: the Varnish default TTL is set via the startup options in /etc/default/varnish
# and can be checked using the following command:
# varnishadm param.show default_ttl
else if (beresp.ttl < 1s) {
set beresp.ttl = 5s;
set beresp.grace = 5s;
set beresp.http.X-Cacheable = "YES:FORCED";
}
# Cacheable and with valid TTL.
else {
set beresp.http.X-Cacheable = "YES";
}
# DEBUG INFO (Cookies)
# set beresp.http.X-Cookie-Debug = "Request cookie: " + req.http.Cookie;
return(deliver);
}
sub vcl_error {
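# Build synthetic error pages per status code (the actual markup is replaced with placeholders in this paste).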
set obj.http.Content-Type = "text/html; charset=utf-8";
if (obj.status == 404) {
synthetic {"
<!-- Markup for the 404 page goes here -->
"};
}
else if (obj.status == 500) {
synthetic {"
<!-- Markup for the 500 page goes here -->
"};
}
else if (obj.status == 503) {
if (req.restarts < 4) { return(restart); }
else {
synthetic {"
<!-- Markup for the 503 page goes here -->
"};
}
}
else {
synthetic {"
<!-- Markup for a generic error page goes here -->
"};
}
}
sub vcl_deliver {
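# Tell the client whether the response was served from cache (hit count > 0) or fetched from the backend.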
if (obj.hits > 0) {
set resp.http.X-Cache = "HIT";
} else {
set resp.http.X-Cache = "MISS";
}
}
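For completeness: the VCL above refers to a www_director with health probes (set req.backend = www_director and the req.backend.healthy check), whose definition is not part of the snippet. It looks roughly like the sketch below; the IP addresses, probe settings and the round-robin director type are placeholders and assumptions rather than our real values:
# One backend per IIS box, each with a simple health probe.
backend web1 {
    .host = "10.0.0.11";
    .port = "80";
    .probe = {
        .url = "/";
        .interval = 5s;
        .timeout = 1s;
        .window = 5;
        .threshold = 3;
    }
}
# web2, web3 and web4 are defined the same way.
# Round-robin director spreading requests over the four backends.
director www_director round-robin {
    { .backend = web1; }
    # { .backend = web2; } etc.
}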
Thanks in advance,