Well, I have one. I've taken care of that memory problem — at least with the "logs" — by clearing each string out before it returns the data. But just to resolve any discrepancies, I'm posting the full code. It's only four classes, so it shouldn't take much. Feel free to look and scavenge through it. Heck, I didn't even write the crawler method — I copied it from something I found on Sun's website, then built the rest of the program around it.
import java.awt.event.ActionListener;
import java.awt.event.ActionEvent;
import javax.swing.text.DefaultCaret;
import javax.swing.JFrame;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JButton;
import javax.swing.JLabel;
import java.awt.Dimension;
import java.awt.Toolkit;
import java.util.ArrayList;
import java.io.PrintWriter;
import java.io.FileOutputStream;
/**
 * Swing front end for a small multi-threaded web crawler.  The left text
 * area takes seed URLs (one per line); each worker gets its own log area,
 * found URLs accumulate in a shared area, and all collected URLs are
 * written to F:\results.txt when the crawl is stopped.
 */
public class WebCrawler implements ActionListener
{
    JFrame frame;
    JPanel panel;
    JTextArea input, found;
    JTextArea[] outputs;   // [0..3] per-worker logs, [4] shared "found" area
    String[] starts;       // seed URLs parsed from the input area
    CrawlManager cm;       // null until Start is pressed

    /** Parses the seed URLs from the input area and launches the crawl. */
    public void start()
    {
        String text = input.getText();
        starts = text.split("\n");
        cm = new CrawlManager(starts, outputs);
        cm.start();
    }

    /** Stops the crawl and dumps every collected URL to F:\results.txt. */
    public void stop()
    {
        // BUG FIX: guard against Stop being pressed before Start (cm == null).
        if (cm == null)
        {
            return;
        }
        cm.stop();
        ArrayList urls = cm.getUrls();
        PrintWriter pw = null;
        try
        {
            pw = new PrintWriter(new FileOutputStream("F:\\results.txt"));
            for (int x = 0; x < urls.size(); x++)
            {
                pw.println((String) urls.get(x));
            }
            pw.flush();
        }
        catch (Exception ex)
        {
            // BUG FIX: errors were silently swallowed; report them so a
            // missing F: drive or write failure doesn't fail invisibly.
            ex.printStackTrace();
        }
        finally
        {
            // BUG FIX: close in finally so the file handle never leaks,
            // even when an exception interrupts the write loop.
            if (pw != null)
            {
                pw.close();
            }
        }
    }

    public void actionPerformed(ActionEvent e)
    {
        String a = e.getActionCommand();
        // BUG FIX: the original compared with ==, which tests reference
        // identity on Strings; equals() compares the actual text.
        if ("Start".equals(a))
        {
            start();
        }
        else if ("Stop".equals(a))
        {
            stop();
        }
    }

    public static void main(String argv[])
    {
        new WebCrawler().go();
    }

    /** Builds and shows the full-screen UI. */
    public void go()
    {
        frame = new JFrame();
        frame.setTitle("Sean's Crawler");
        Dimension dim = Toolkit.getDefaultToolkit().getScreenSize();
        frame.setBounds(0, 0, dim.width, dim.height);
        panel = new JPanel();
        panel.setLayout(null); // absolute positioning throughout
        input = new JTextArea();
        input.setBounds(0, 0, frame.getWidth() - 500, frame.getHeight() / 2 - 100);
        panel.add(input);
        found = new JTextArea();
        // Keep the caret at the end so the area auto-scrolls as URLs arrive.
        DefaultCaret caret = (DefaultCaret) found.getCaret();
        caret.setUpdatePolicy(DefaultCaret.ALWAYS_UPDATE);
        found.setLineWrap(true);
        JScrollPane jsp = new JScrollPane(found);
        jsp.setBounds(frame.getWidth() - 500, 50, 200, frame.getHeight() / 2 - 100);
        jsp.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_ALWAYS);
        found.setEditable(false);
        panel.add(jsp);
        JLabel l = new JLabel("URL's Found: ");
        l.setBounds(frame.getWidth() - 500, 0, 200, 50);
        panel.add(l);
        outputs = new JTextArea[5];
        outputs = createOutputs(); // fills outputs[0..3], adds them to the panel
        outputs[4] = found;
        JButton start = new JButton("Start");
        start.addActionListener(this);
        start.setBounds(frame.getWidth() - 300, 0, 300, frame.getHeight() / 2 - 5);
        panel.add(start);
        JButton stop = new JButton("Stop");
        stop.setBounds(frame.getWidth() - 250, frame.getHeight() / 2 + 5, 250, frame.getHeight() / 2 - 5);
        stop.addActionListener(this);
        panel.add(stop);
        frame.getContentPane().add(panel);
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        frame.setVisible(true);
    }

    /**
     * Creates the four worker log areas (scroll panes + labels), wires them
     * into the panel, stores them in outputs[0..3], and returns the array.
     */
    public JTextArea[] createOutputs()
    {
        for (int x = 0; x < 4; x++)
        {
            JTextArea output = new JTextArea();
            DefaultCaret caret = (DefaultCaret) output.getCaret();
            caret.setUpdatePolicy(DefaultCaret.ALWAYS_UPDATE);
            output.setLineWrap(true);
            JScrollPane jsp = new JScrollPane(output);
            jsp.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_ALWAYS);
            output.setEditable(false);
            jsp.setBounds(x * 200, frame.getHeight() / 2 + 5, 200, frame.getHeight() / 2 - 5);
            JLabel l = new JLabel("Worker : " + x);
            l.setBounds(x * 200, frame.getHeight() / 2 - 50, 100, 100);
            panel.add(l);
            panel.add(jsp);
            outputs[x] = output;
        }
        return outputs;
    }
}
import javax.swing.JTextArea;
import java.util.ArrayList;
/**
 * Coordinates one Crawler thread per seed URL (1-4 allowed) plus the
 * Updater that mirrors their progress into the UI text areas.
 */
public class CrawlManager
{
    String[] starts;
    Crawler[] crawlers;
    Updater upd;
    ArrayList urls; // URLs harvested from all crawlers once stopped

    /**
     * @param starts  seed URLs; one crawler is created per entry
     * @param outputs text areas handed through to the Updater
     * @throws RuntimeException if starts is empty or has more than 4 entries
     */
    public CrawlManager(String[] starts, JTextArea[] outputs)
    {
        this.starts = starts;
        urls = new ArrayList();
        // CLEANUP: replaces a four-branch switch that duplicated the same
        // "new Crawler per seed" logic; a bounds check plus a loop is
        // exactly equivalent (including the default-case exception).
        if (starts.length < 1 || starts.length > 4)
        {
            throw new RuntimeException("You have submitted a wrong value!");
        }
        crawlers = new Crawler[starts.length];
        for (int x = 0; x < starts.length; x++)
        {
            crawlers[x] = new Crawler(starts[x]);
        }
        upd = new Updater(crawlers, outputs, this);
    }

    /** Returns the URLs collected by {@link #stop()}. */
    public ArrayList getUrls()
    {
        return urls;
    }

    /** Starts every crawler thread, then the UI updater. */
    public void start()
    {
        if (crawlers != null)
        {
            for (int x = 0; x < starts.length; x++)
            {
                crawlers[x].start();
            }
        }
        upd.start();
    }

    /**
     * Stops the updater and copies each crawler's searched URLs into
     * {@code urls}.  NOTE(review): the Crawler threads themselves have no
     * stop flag — they keep running until their queue empties; nulling the
     * references here only drops them from this manager.
     */
    public void stop()
    {
        upd.stop();
        for (int x = 0; x < starts.length; x++)
        {
            urls.addAll(crawlers[x].searched);
            crawlers[x] = null;
        }
    }
}
import java.util.*;
import java.net.*;
import java.io.*;
/**
 * Breadth-first web crawler seeded with one start URL.  Visited URLs
 * accumulate in {@code searched}; pending ones in {@code toSearch}.
 * Status text accumulates in {@code log} and is drained by the Updater
 * through {@link #getStatus()}.
 */
public class Crawler extends Thread
{
    ArrayList toSearch, searched;
    String start;
    // BUG FIX: was never initialized, so the first setStatus() produced
    // text prefixed with "null" and getStatus() could return null.
    String log = "";

    public Crawler(String start)
    {
        this.start = start;
        toSearch = new ArrayList();
        searched = new ArrayList();
    }

    /**
     * Returns the status text accumulated since the last call and clears it.
     * Synchronized because the Updater thread calls this while the crawler
     * thread appends via setStatus().
     */
    public synchronized String getStatus()
    {
        String j = log;
        log = "";
        return j;
    }

    private synchronized void setStatus(String text)
    {
        log += text;
    }

    /**
     * Fetches the host's robots.txt and returns false if the URL's path
     * starts with any "Disallow:" rule.  A robots.txt that cannot be read
     * is treated as "allowed".
     */
    boolean robotSafe(URL url)
    {
        String strHost = url.getHost();
        String strRobot = "http://" + strHost + "/robots.txt";
        URL urlRobot;
        try
        {
            urlRobot = new URL(strRobot);
        }
        catch (MalformedURLException e)
        {
            return false;
        }
        String strCommands;
        try
        {
            InputStream urlRobotStream = urlRobot.openStream();
            // BUG FIX: the original concatenated Scanner.next() tokens,
            // stripping ALL whitespace — the StringTokenizer below then saw
            // the entire remainder of robots.txt as a single "path".  Read
            // line by line and keep separators so Disallow paths parse.
            StringBuilder sb = new StringBuilder();
            Scanner f = new Scanner(urlRobotStream);
            while (f.hasNextLine())
            {
                sb.append(f.nextLine()).append('\n');
            }
            urlRobotStream.close();
            f.close();
            strCommands = sb.toString();
        }
        catch (IOException e)
        {
            return true;
        }
        String strURL = url.getFile();
        int index = 0;
        while ((index = strCommands.indexOf("Disallow:", index)) != -1)
        {
            index += "Disallow:".length();
            String strPath = strCommands.substring(index);
            StringTokenizer st = new StringTokenizer(strPath);
            if (!st.hasMoreTokens())
                break;
            String strBadPath = st.nextToken();
            // "Starts with the disallowed prefix" check.
            if (strURL.indexOf(strBadPath) == 0)
                return false;
        }
        return true;
    }

    /**
     * Crawl loop: pop the next URL, fetch its content, scan for anchor
     * hrefs, and queue each new robot-safe http link exactly once.
     */
    public void run()
    {
        String strURL = start;
        toSearch.clear();
        searched.clear();
        toSearch.add(strURL);
        while (toSearch.size() > 0)
        {
            strURL = (String) toSearch.get(0);
            setStatus("searching " + strURL);
            URL url;
            try
            {
                url = new URL(strURL);
            }
            catch (MalformedURLException e)
            {
                setStatus("ERROR: invalid URL at line 92" + strURL);
                break;
            }
            toSearch.remove(0);
            searched.add(strURL);
            // Only plain http pages are fetched.
            if (url.getProtocol().compareTo("http") != 0)
            {
                setStatus("Break at line 99");
                break;
            }
            if (!robotSafe(url))
            {
                setStatus("Not Robot safe!");
                break;
            }
            try
            {
                URLConnection urlConnection = url.openConnection();
                setStatus("Opening URL...");
                urlConnection.setAllowUserInteraction(false);
                // BUG FIX: was url.openStream(), which opened a SECOND
                // connection and ignored the one configured above.
                InputStream urlStream = urlConnection.getInputStream();
                // BUG FIX: O(n^2) String += in a loop replaced with
                // StringBuilder; same resulting text (lines joined with no
                // separator, matching the original behavior).
                StringBuilder content = new StringBuilder();
                Scanner f = new Scanner(urlStream);
                setStatus("Gathering data...");
                while (f.hasNextLine())
                {
                    content.append(f.nextLine());
                }
                urlStream.close();
                f.close();
                String page = content.toString();
                String lowerCaseContent = page.toLowerCase();
                int index = 0;
                while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
                {
                    if ((index = lowerCaseContent.indexOf("href", index)) == -1)
                    {
                        setStatus("Break at line 128");
                        break;
                    }
                    if ((index = lowerCaseContent.indexOf("=", index)) == -1)
                    {
                        setStatus("Break at line 133");
                        break;
                    }
                    setStatus("Finding links...");
                    index++;
                    String remaining = page.substring(index);
                    StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
                    // BUG FIX: guard against "=" being the final character;
                    // nextToken() on an empty tokenizer throws.
                    if (!st.hasMoreTokens())
                    {
                        break;
                    }
                    String strLink = st.nextToken();
                    URL urlLink;
                    try
                    {
                        // Resolve relative links against the current page.
                        urlLink = new URL(url, strLink);
                        strLink = urlLink.toString();
                    }
                    catch (MalformedURLException e)
                    {
                        setStatus("ERROR: bad URL at line 149" + strLink);
                        continue;
                    }
                    if (urlLink.getProtocol().compareTo("http") != 0)
                    {
                        setStatus("Break at line 154");
                        break;
                    }
                    // Queue each link at most once, and only if robots.txt allows.
                    if ((!searched.contains(strLink)) && (!toSearch.contains(strLink)))
                    {
                        if (robotSafe(urlLink))
                        {
                            toSearch.add(strLink);
                            setStatus("Added " + strLink);
                        }
                    }
                }
            }
            catch (IOException e)
            {
                setStatus("ERROR: couldn't open URL at line 139" + strURL);
                break;
            }
        }
    }
}
import javax.swing.JTextArea;
import java.util.ArrayList;
/**
 * Background thread that periodically copies each crawler's status log and
 * newly-found URLs into the Swing text areas.
 *
 * NOTE(review): the text areas are updated from this worker thread rather
 * than the Event Dispatch Thread; wrapping the appends in
 * SwingUtilities.invokeLater would be the strictly correct approach —
 * confirm before relying on the current behavior.
 */
public class Updater implements Runnable
{
    Crawler[] crawlers;
    // BUG FIX: must be volatile — written by the UI thread in stop(),
    // read by the updater thread in run(); without it the loop may never
    // observe the flag change.
    volatile boolean stop;
    Thread t;
    JTextArea[] outputs;
    CrawlManager cm;
    ArrayList posted; // URLs already appended to the shared "found" area

    public Updater(Crawler[] crawlers, JTextArea[] outputs, CrawlManager cm)
    {
        this.crawlers = crawlers;
        stop = false;
        this.outputs = outputs;
        posted = new ArrayList();
        this.cm = cm;
    }

    /** Starts the update loop on a fresh thread. */
    public void start()
    {
        t = new Thread(this);
        t.start();
    }

    /** Signals the loop to exit and marks every output area as stopped. */
    public void stop()
    {
        stop = true;
        t = null; // drop the reference; the loop exits via the flag
        for (int x = 0; x < outputs.length; x++)
        {
            outputs[x].setText(null);
            outputs[x].append("Stopped!");
        }
    }

    public void run()
    {
        while (!stop)
        {
            // Mirror newly-discovered URLs into the shared "found" area,
            // each exactly once.
            for (int x = 0; x < crawlers.length; x++)
            {
                ArrayList urls = crawlers[x].searched;
                for (int f = 0; f < urls.size(); f++)
                {
                    String a = (String) urls.get(f);
                    if (!posted.contains(a))
                    {
                        outputs[4].append(a);
                        posted.add(a);
                    }
                }
            }
            // Drain each crawler's status log into its own area.
            for (int x = 0; x < crawlers.length; x++)
            {
                String text = crawlers[x].getStatus();
                // Guard: getStatus may hand back null before the crawler's
                // first log write; appending null would print "null".
                if (text != null)
                {
                    outputs[x].append(text);
                }
            }
            try
            {
                // BUG FIX: was crawlers[x].sleep(100) inside the loop — a
                // static Thread.sleep called through an instance reference,
                // which only ever slept THIS thread (N times per pass).
                // One explicit sleep per pass is the intent.
                Thread.sleep(100);
            }
            catch (InterruptedException ex)
            {
                // Preserve the interrupt status and exit cleanly.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}