import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils; // or org.apache.commons.lang3.StringUtils, depending on the Commons Lang version

// One crawler thread per outlet: starting from the outlet's landing page(s) it
// collects article-looking links and then downloads each of them.
class WebCrawlerBatch extends Thread
{
String startUrl = "";
int Outlet_ID = 0;
StringBuilder pageBuffer = null;
BufferedReader reader = null;
int respCode = 0;
String line = "";
ArrayList<String> linkList = null;
SBSearchEngine sbSearch = null; // project-specific helper class (not shown in the posting)
private boolean crawling;

// Constructor for the search web crawler.
public WebCrawlerBatch(String startUrl, int Outlet_ID)
{
this.startUrl = startUrl;
this.Outlet_ID = Outlet_ID;
}

public void run()
{
actionSearch(startUrl, Outlet_ID);
}
// Start a crawl for this outlet unless one is already in progress.
private void actionSearch(String startUrl, int Outlet_ID)
{
if (crawling) {
crawling = false;
return;
}
int maxUrls = 1000000000; // effectively unlimited (Integer.MAX_VALUE is 2,147,483,647)
String searchString = " ";
startUrl = removeWwwFromUrl(startUrl);
search(startUrl, maxUrls, searchString, Outlet_ID);
}
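// Returns the text between strStart and strEnd in htmltext, trimmed and truncated
// when longer than maxLen characters, or "NIL" when the markers are not found.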
public String check(String htmltext, String strStart, String strEnd, String url)
{
int maxLen = 1000;
String searchdata = StringUtils.substringBetween(htmltext, strStart, strEnd);
if (searchdata != null)
{
if (searchdata.length() > maxLen)
searchdata = searchdata.substring(0, maxLen - 1);
return searchdata.trim();
}
else {
return "NIL";
}
}
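// For example (the markers here are only illustrative), check(pageContents, "<title>", "</title>", url)
// would return the text of the page title.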
// Run the Search Crawler.
private void search( final String startUrl, final int maxUrls, final String searchString,final int Outlet_ID )
{
crawling = true;
crawl(startUrl, maxUrls, true,searchString, true,Outlet_ID);
crawling = false;
//System.out.println("Free Memory at the end of Search module" +Runtime.getRuntime().freeMemory());
}
// Verify URL format.
private URL verifyUrl(String url) {
// Only allow HTTP URLs.
if (!url.toLowerCase().startsWith("http://"))
return null;
// Verify format of URL.
URL verifiedUrl = null;
try {
verifiedUrl = new URL(url);
} catch (Exception e) {
return null;
}
return verifiedUrl;
}
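// Downloads the page at pageUrl over HTTP and returns its contents as a single
// string; returns null when the download fails or the server answers with one of
// the excluded error codes.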
private String downloadPage(URL pageUrl, int Outlet_ID) {
try {
// Start with a clean buffer so a failed request cannot return a stale page
// left over from a previous call.
pageBuffer = null;
URLConnection uconn = pageUrl.openConnection();
// Only request content modified after this date (Calendar months are zero-based, so 4 = May).
Calendar c1 = Calendar.getInstance();
c1.set(2010, 04, 23);
long c2 = c1.getTimeInMillis();
uconn.setIfModifiedSince(c2);
if (!(uconn instanceof HttpURLConnection))
{
throw new java.lang.IllegalArgumentException("URL protocol must be HTTP.");
}
final java.net.HttpURLConnection conn1 = (java.net.HttpURLConnection) uconn;
// Set the timeouts before getResponseCode(), which actually opens the
// connection; setting them afterwards has no effect.
conn1.setConnectTimeout(50000);
conn1.setReadTimeout(50000);
respCode = conn1.getResponseCode();
if (respCode != 404 && respCode != 403 && respCode != 400 && respCode != 504 && respCode != 502)
{
reader = new BufferedReader(new InputStreamReader(conn1.getInputStream()));
pageBuffer = new StringBuilder();
while ((line = reader.readLine()) != null)
{
pageBuffer.append(line);
}
reader.close();
}
try
{
conn1.disconnect();
}
catch (Exception ee) {}
// pageBuffer is still null when the response code was excluded above.
return pageBuffer == null ? null : pageBuffer.toString();
}
catch (java.net.SocketTimeoutException timeout) { System.out.println("Timed out"); }
catch (java.io.IOException ioe) { /* ignored */ }
catch (Exception e) { /* ignored */ }
pageBuffer = null;
return null;
}
// Remove leading "www" from a URL's host if present.
private String removeWwwFromUrl(String url) {
Pattern p=Pattern.compile("://www\\d*.",Pattern.CASE_INSENSITIVE);
Matcher m= p.matcher(url);
if(m.find())
{
String wwwcut = m.group().trim();
int c1=url.indexOf(wwwcut);
int c = wwwcut.lastIndexOf(".")+1;
return url.substring(0, c1+3)+url.substring(c1+c);
}
return (url);
}
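// Parses pageContents for <a href="..."> links, normalizes them, and returns the
// ones under pgstart that look like article pages.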
private ArrayList<String> retrieveLinks(
URL pageUrl, String pageContents, HashSet<String> crawledList,
boolean limitHost, String pgstart, LinkedHashSet<String> LandPage, LinkedHashSet<String> finalCrawlList)
{
pgstart = removeWwwFromUrl(pgstart);
// Compile the link-matching pattern: the href value up to the closing quote or '>'.
Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]", Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(pageContents);
// Create the list of link matches.
linkList = new ArrayList<String>();
String link = "";
String file = "";
String path = "";
int index;
URL verifiedLink = null;
while (m.find()) {
link = m.group(1).trim();
// Skip empty links.
if (link.length() < 1) {
continue;
}
// Skip links that are just page anchors.
if (link.charAt(0) == '#') {
continue;
}
// Skip mailto links.
if (link.indexOf("mailto:") != -1) {
continue;
}
// Skip JavaScript links.
if (link.toLowerCase().indexOf("javascript") != -1) {
continue;
}
// Turn relative links into full URLs if necessary.
if (link.indexOf("://") == -1) {
// Handle root-relative links (absolute path on the same host).
if (link.charAt(0) == '/') {
link = "http://" + pageUrl.getHost() + link;
// Handle relative URLs.
} else {
file = pageUrl.getFile();
if (file.indexOf('/') == -1) {
link = "http://" + pageUrl.getHost() + "/" + link;
} else {
path =
file.substring(0, file.lastIndexOf('/') + 1);
link = "http://" + pageUrl.getHost() + path + link;
}
}
}
// Remove anchors from link.
index = link.indexOf('#');
if (index != -1) {
link = link.substring(0, index);
}
// Remove leading "www" from URL's host if present.
link = removeWwwFromUrl(link);
// Verify link and skip if invalid.
verifiedLink = verifyUrl(link);
if (verifiedLink == null) {
continue;
}
/* If specified, limit links to those
having the same host as the start URL. */
if (limitHost &&
!pageUrl.getHost().toLowerCase().equals(
verifiedLink.getHost().toLowerCase()))
{
continue;
}
// Skip link if it has already been crawled.
/*if (crawledList.contains(link)) {
continue;
}*/
if (link.charAt(link.length() - 1) == '/')
{
link = link.substring(0, link.length() - 1);
System.out.println("Trailing slash removed: " + link);
}
if (finalCrawlList.contains(link))
{
System.out.println("Link ignored, already in finalCrawlList: " + link);
continue;
}
if (LandPage.contains(link))
{
System.out.println("Link ignored, already queued as a landing page: " + link);
continue;
}
if (link.startsWith(pgstart))
{
System.out.println("Link coming up for article check: " + link);
// Keep links that look like article pages (story/article keywords or digits)...
if (link.matches("http://.*(STORY|ARTICLE|story|article|\\d{1,}).*"))
{
if (!linkList.contains(link))
linkList.add(link);
System.out.println("Link added (article/story/number match)");
}
// ...or that sit several path segments deep.
else if (link.matches("http://.*(/.*){4,}.*"))
{
if (!linkList.contains(link))
linkList.add(link);
System.out.println("Link added (deep path match)");
}
}
//System.out.println("LINK avoided IS "+link);
//linkList.add(link);//MK comment this after test sept 27
}
return (linkList);
}
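// Crawl one outlet: queue its landing pages, harvest article-looking links from
// each of them, then download every collected link.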
public void crawl(
String startUrl, int maxUrls, boolean limitHost,
String searchString, boolean caseSensitive, int Outlet_ID)
{
long before = new Date().getTime();
String sourceURL = startUrl;
int count = 0;
int i = 0;
int j = 0;
// Set up crawl lists.
String strResult = "";
HashSet<String> crawledList = new HashSet<String>();
LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
LinkedHashSet<String> finalCrawlList = new LinkedHashSet<String>();
int k = 0;
// The declarations and the landing-page loop header below are reconstructed from
// how the variables are used in this method; the code that fills arrayLP
// (apparently from a database, given the JDBC-style catch blocks at the end)
// was elided in the original posting.
ArrayList<String> arrayLP = new ArrayList<String>(); // alternating entries: landing-page URL, article-URL prefix
String strLandingPage = "";
String pagestart = "";
String pageContents = null;
String url = "";
URL verifiedUrl = null;
ArrayList<String> links = new ArrayList<String>();
int lpcount = 0;
int landpgcount = 0;
int downloadedPgCount = 0;
try
{
// Elided here in the original posting: load this outlet's landing pages and
// article-URL prefixes into arrayLP.
// Queue every landing page (the even-indexed entries of arrayLP).
while (k < arrayLP.size())
{
// strLandingPage is the outlet's home page; drop any trailing slash.
strLandingPage = arrayLP.get(k);
if (strLandingPage.charAt(strLandingPage.length() - 1) == '/')
{
strLandingPage = strLandingPage.substring(0, strLandingPage.length() - 1);
System.out.println("The landing page is normalized to " + strLandingPage);
}
toCrawlList.add(strLandingPage);
lpcount = lpcount + 1;
k = k + 2;
}
while (crawling && toCrawlList.size() > 0)
{
/* Check to see if the max URL count has
been reached, if it was specified.*/
if (maxUrls != -1) {
if (crawledList.size() == maxUrls) {
break;
}
}
url = (String) toCrawlList.iterator().next();
toCrawlList.remove(url);
verifiedUrl = verifyUrl(url);
// Add page to the crawled list.
crawledList.add(url);
downloadedPgCount++;
try{
pageContents = downloadPage(verifiedUrl, Outlet_ID);
}
catch (java.lang.NullPointerException nexp) { System.out.println("Null pointer caught while downloading landing page " + url); }
if (pageContents != null && pageContents.length() > 0)
{
// The article-URL prefix paired with this landing page.
pagestart = arrayLP.get(j + 1);
if (pagestart.charAt(pagestart.length() - 1) == '/')
{
pagestart = pagestart.substring(0, pagestart.length() - 1);
System.out.println("The page-start prefix is normalized to " + pagestart);
}
links =retrieveLinks(verifiedUrl, pageContents, crawledList,limitHost,pagestart,toCrawlList, finalCrawlList);
finalCrawlList.addAll(links);
}
//End of If page contents
landpgcount++;
j=j+2;
}
links.clear();
linkList.clear();
crawledList.clear();
toCrawlList.clear();
URL downloadURL = null;
Iterator<String> itr = finalCrawlList.iterator();
System.out.println("Percentage of free memory before any SQL updates: " + (Runtime.getRuntime().freeMemory() * 100) / Runtime.getRuntime().totalMemory());
int h = 0;
// Download every collected article link.
while(itr.hasNext())
{
try{
downloadURL = new URL(itr.next().toString());
itr.remove();
}
catch (java.net.MalformedURLException e) {
e.printStackTrace();
System.out.println("Malformed URL for outlet " + Outlet_ID + " (last good URL: " + downloadURL + ")");
}
try
{
pageContents = downloadPage(downloadURL, Outlet_ID);
}
catch (java.lang.NullPointerException nex) { System.out.println("Null pointer caught while downloading link " + downloadURL); }
pageBuffer = null;
// Close any reader left open by downloadPage; IOException is not handled by the
// outer catches, so swallow it here.
try { if (reader != null) reader.close(); } catch (Exception e) {}
// NOTE: some string operations are done here to retrieve data from the
// downloaded content (elided in the original posting).
}
System.gc();
pagestart = "";
pageContents = "";
arrayLP.clear();
finalCrawlList.clear();
System.out.println("Cleared all crawl lists");
System.out.println("Percentage of free memory at the end: " + (Runtime.getRuntime().freeMemory() * 100) / Runtime.getRuntime().totalMemory());
System.out.println("At the end of outlet " + Outlet_ID);
} // End of the outer try
// These catches guard the driver-loading / database code that was elided above.
catch (ClassNotFoundException c) { System.out.println("ClassNotFoundException"); c.printStackTrace(); }
catch (InstantiationException I) { System.out.println("InstantiationException"); }
catch (IllegalAccessException IA) { System.out.println("IllegalAccessException"); }
}
}
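// Minimal usage sketch (not part of the original posting; the URL and outlet id
// are placeholders): one crawler thread per outlet.
// new WebCrawlerBatch("http://example.com/news", 42).start();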