Parsing XHTML results from Bing
- by Nir
Hello, i am trying to parse received search queries from bing search engines which are received in xhtml in java. I am using sax XmlReader to read the results but i keep on getting errors.
here is my code-this one is for the hadler of the reader:
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class XHTMLHandler extends DefaultHandler{
public XHTMLHandler()
{
super();
}
public void startDocument ()
{
System.out.println("Start document");
}
public void endDocument ()
{
System.out.println("End document");
}
public void startElement (String uri, String name,String qName, Attributes atts)
{
if ("".equals (uri))
System.out.println("Start element: " + qName);
else
System.out.println("Start element: {" + uri + "}" + name);
}
public void endElement (String uri, String name, String qName)
{
if ("".equals (uri))
System.out.println("End element: " + qName);
else
System.out.println("End element: {" + uri + "}" + name);
}
public void startPrefixMapping (String prefix, String uri)
throws SAXException {
}
public void endPrefixMapping (String prefix)
throws SAXException {
}
public void characters (char ch[], int start, int length)
{
System.out.print("Characters: \"");
for (int i = start; i < start + length; i++) {
switch (ch[i]) {
case '\\':
System.out.print("\\\\");
break;
case '"':
System.out.print("\\\"");
break;
case '\n':
System.out.print("\\n");
break;
case '\r':
System.out.print("\\r");
break;
case '\t':
System.out.print("\\t");
break;
default:
System.out.print(ch[i]);
break;
}
}
System.out.print("\"\n");
}
}
and this is the program itself:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpRetryException;
import java.net.HttpURLConnection;
import java.net.URL;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
public class Searching {
private String m_urlBingSearch = "http://www.bing.com/search?q=";
private HttpURLConnection m_httpCon;
private OutputStreamWriter m_streamWriter;
//private BufferedReader m_bufferReader;
private URL m_serverAdress;
private StringBuilder sb;
private String m_line;
private InputSource m_inputSrc;
public Searching()
{
m_httpCon = null;
m_streamWriter = null;
//m_bufferReader = null;
m_serverAdress = null;
sb = null;
m_line = new String();
}
public void SearchBing(String searchPrms) throws SAXException,IOException
{
//set up connection
sb = new StringBuilder();
sb.append(m_urlBingSearch);
sb.append(searchPrms);
m_serverAdress = new URL(sb.toString());
m_httpCon = (HttpURLConnection)m_serverAdress.openConnection();
m_httpCon.setRequestMethod("GET");
m_httpCon.setDoOutput(true);
m_httpCon.setConnectTimeout(10000);
m_httpCon.connect();
//m_streamWriter = new OutputStreamWriter(m_httpCon.getOutputStream());
//m_bufferReader = new BufferedReader(new InputStreamReader(m_httpCon.getInputStream()));
XMLReader reader = XMLReaderFactory.createXMLReader();
XHTMLHandler handle = new XHTMLHandler();
reader.setContentHandler(handle);
reader.setErrorHandler(handle);
//reader.startPrefixMapping("html", "http://www.w3.org/1999/xhtml");
handle.startPrefixMapping("html", "http://www.w3.org/1999/xhtml");
m_inputSrc = new InputSource(m_httpCon.getInputStream());
reader.parse(m_inputSrc);
m_httpCon.disconnect();
}
public static void main(String [] args) throws SAXException,IOException
{
Searching s = new Searching();
s.SearchBing("beatles");
}
}
this is my error message:
Exception in thread "main" java.io.IOException: Server returned HTTP response code: 503 for URL: http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.setupCurrentEntity(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startEntity(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startDTDEntity(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.setInputSource(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.dispatch(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.next(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLNSDocumentScannerImpl.next(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source)
at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source)
at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(Unknown Source)
at com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.parse(Unknown Source)
at Searching.SearchBing(Searching.java:57)
at Searching.main(Searching.java:65)
can someone please help? i think it has something to do with dtd but i don't know hot to fix it