import java.applet.Applet;
import java.text.*;
import java.awt.*;
import java.awt.event.*;
import java.util.*;
import java.net.*;
import java.io.*;
/**
 * A small AWT web crawler that can run as an applet or, via {@link #main},
 * inside a stand-alone frame.
 *
 * Starting from the URL typed by the user, the crawler fetches HTML pages over
 * http, extracts the anchors it finds, and lists every link whose sniffed
 * content type matches the type selected in the "Content type" choice. Pages
 * are skipped when the host's robots.txt disallows them. The crawl runs on a
 * background thread; it stops when the STOP button is pressed, when the queue
 * is empty, or when SEARCH_LIMIT pages or matches have been reached.
 *
 * Cancellation protocol: {@code srchthrd} holds the thread that is allowed to
 * crawl. A worker keeps going only while {@code Thread.currentThread()} is
 * still that thread; {@link #stop()} cancels by setting the field to null.
 */
public class WebCrawler extends Applet implements ActionListener, Runnable {
    /** Action command / label of the button that starts a crawl. */
    public static final String SEARCH = "Search";
    /** Action command / label of the button that cancels a crawl. */
    public static final String STOP = "Stop";
    /** robots.txt directive that introduces a forbidden path prefix. */
    public static final String DISALLOW = "Disallow:";
    /** Maximum number of pages searched, and of matches reported, per crawl. */
    public static final int SEARCH_LIMIT = 50;

    Panel pnlmain;
    // Fully qualified: both java.awt.* and java.util.* are imported by this
    // file, and each declares a type named List.
    java.awt.List lstmtch;
    Label lblstus;
    /** URLs (as Strings) still waiting to be searched. */
    Vector vctrsrch;
    /** URLs (as Strings) that have already been taken from the queue. */
    Vector vcteserched;
    /** URLs (as Strings) already reported as matches. */
    Vector vctrmatch;
    /** The thread currently allowed to crawl, or null when idle/cancelled. */
    volatile Thread srchthrd;
    TextField txturl;
    Choice chtyp;

    /** Builds the user interface and creates the (empty) work lists. */
    public void init() {
        pnlmain = new Panel();
        pnlmain.setLayout(new BorderLayout(5, 5));

        // --- entry area: starting URL on top, content type below ---
        Panel panelEntry = new Panel();
        panelEntry.setLayout(new BorderLayout(5, 5));

        Panel panelURL = new Panel();
        panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
        Label labelURL = new Label("Starting URL: ", Label.RIGHT);
        panelURL.add(labelURL);
        txturl = new TextField("", 40);
        panelURL.add(txturl);
        panelEntry.add("North", panelURL);

        Panel panelType = new Panel();
        panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
        Label labelType = new Label("Content type: ", Label.RIGHT);
        panelType.add(labelType);
        chtyp = new Choice();
        chtyp.addItem("html");
        chtyp.addItem("basic");
        chtyp.addItem("au");
        chtyp.addItem("aiff");
        chtyp.addItem("wav");
        chtyp.addItem("mpeg");
        chtyp.addItem("x-avi");
        panelType.add(chtyp);
        panelEntry.add("South", panelType);
        pnlmain.add("North", panelEntry);

        // --- result list, status line and buttons ---
        Panel panelListButtons = new Panel();
        panelListButtons.setLayout(new BorderLayout(5, 5));

        Panel panelList = new Panel();
        panelList.setLayout(new BorderLayout(5, 5));
        Label labelResults = new Label("Search results");
        panelList.add("North", labelResults);

        Panel panelListCurrent = new Panel();
        panelListCurrent.setLayout(new BorderLayout(5, 5));
        lstmtch = new java.awt.List(10);
        panelListCurrent.add("North", lstmtch);
        lblstus = new Label("");
        panelListCurrent.add("South", lblstus);
        panelList.add("South", panelListCurrent);
        panelListButtons.add("North", panelList);

        Panel pnlbutn = new Panel();
        Button btnSearch = new Button(SEARCH);
        btnSearch.addActionListener(this);
        pnlbutn.add(btnSearch);
        Button buttonStop = new Button(STOP);
        buttonStop.addActionListener(this);
        pnlbutn.add(buttonStop);
        panelListButtons.add("South", pnlbutn);
        pnlmain.add("South", panelListButtons);

        add(pnlmain);
        setVisible(true);
        repaint();

        vctrsrch = new Vector();
        vcteserched = new Vector();
        vctrmatch = new Vector();

        URLConnection.setDefaultAllowUserInteraction(false);
    }

    /** Applet lifecycle hook; nothing to do until the user presses Search. */
    public void start() {
    }

    /**
     * Cancels a running crawl, if any. Also serves as the applet lifecycle
     * stop hook. The worker notices the cancellation the next time it compares
     * itself with {@code srchthrd}.
     */
    public void stop() {
        if (srchthrd != null) {
            setStatus("stop");
            srchthrd = null;
        }
    }

    /** Applet lifecycle hook; no resources to release. */
    public void destroy() {
    }

    /**
     * Checks the host's robots.txt to decide whether {@code url} may be
     * visited.
     *
     * @param url the page about to be fetched
     * @return false when the robots.txt URL cannot be formed or when a
     *         Disallow entry is a prefix of the URL's file part; true
     *         otherwise, including when robots.txt cannot be read
     */
    boolean rbtsafe(URL url) {
        URL urlRobot;
        try {
            // Keep the port of the original URL (-1 means "default port") so
            // that the robots.txt of the same server is consulted.
            urlRobot = new URL("http", url.getHost(), url.getPort(), "/robots.txt");
        } catch (MalformedURLException e) {
            // Something weird is happening, so don't trust the host.
            return false;
        }

        String strCommands;
        try {
            InputStream urlRobotStream = urlRobot.openStream();
            try {
                strCommands = readAll(urlRobotStream);
            } finally {
                urlRobotStream.close();
            }
        } catch (IOException e) {
            // No readable robots.txt: nothing is forbidden.
            return true;
        }

        return !isDisallowed(strCommands, url.getFile());
    }

    /**
     * Tells whether any Disallow entry in the given robots.txt text is a
     * prefix of {@code path}. All Disallow lines are honoured regardless of
     * the User-agent section they appear in. An entry with no value on its
     * own line forbids nothing.
     *
     * @param robotsText the full text of a robots.txt file
     * @param path       the file part of the URL being checked
     * @return true if the path is forbidden
     */
    static boolean isDisallowed(String robotsText, String path) {
        int index = 0;
        while ((index = robotsText.indexOf(DISALLOW, index)) != -1) {
            index += DISALLOW.length();

            // Only look at the remainder of this line, so an empty directive
            // does not pick up the first word of the following line.
            int lineEnd = index;
            while (lineEnd < robotsText.length()
                    && robotsText.charAt(lineEnd) != '\n'
                    && robotsText.charAt(lineEnd) != '\r') {
                lineEnd++;
            }

            StringTokenizer st = new StringTokenizer(robotsText.substring(index, lineEnd));
            if (!st.hasMoreTokens()) {
                continue;
            }
            if (path.startsWith(st.nextToken())) {
                return true;
            }
        }
        return false;
    }

    /** Draws a border around the applet, then the main panel. */
    public void paint(Graphics g) {
        g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);
        pnlmain.paint(g);
        pnlmain.paintComponents(g);
    }

    /**
     * Body of the search thread: performs one crawl and then releases the
     * thread handle so that a new search can be started.
     */
    public void run() {
        try {
            crawl();
        } finally {
            // Clear the handle only if it is still ours; after Stop followed
            // by Search it already belongs to a newer worker.
            if (srchthrd == Thread.currentThread()) {
                srchthrd = null;
            }
        }
    }

    /** Runs the crawl loop described in the class comment. */
    private void crawl() {
        String strURL = txturl.getText();
        String strTargetType = chtyp.getSelectedItem();
        int numberSearched = 0;
        int numberFound = 0;

        if (strURL.length() == 0) {
            setStatus("ERROR: must enter a starting URL");
            return;
        }

        vctrsrch.removeAllElements();
        vcteserched.removeAllElements();
        vctrmatch.removeAllElements();
        lstmtch.removeAll();
        vctrsrch.addElement(strURL);

        while (vctrsrch.size() > 0
                && Thread.currentThread() == srchthrd
                && numberSearched < SEARCH_LIMIT
                && numberFound < SEARCH_LIMIT) {
            strURL = (String) vctrsrch.elementAt(0);
            vctrsrch.removeElementAt(0);
            vcteserched.addElement(strURL);
            setStatus("searching " + strURL);

            URL url;
            try {
                url = new URL(strURL);
            } catch (MalformedURLException e) {
                // Queued links are stored in URL.toString() form, so this is
                // the URL typed by the user. Return so the message stays
                // visible instead of being replaced by "done".
                setStatus("ERROR: invalid URL " + strURL);
                return;
            }

            // Skip pages we cannot or must not read; keep crawling the rest.
            if (!"http".equals(url.getProtocol()) || !rbtsafe(url)) {
                continue;
            }

            String content;
            try {
                content = fetchHtml(url);
            } catch (IOException e) {
                setStatus("ERROR: couldn't open URL " + strURL);
                continue;
            }
            if (content == null) {
                continue; // not an HTML page
            }
            if (Thread.currentThread() != srchthrd) {
                break;
            }

            numberFound = scanLinks(url, content, strTargetType, numberFound);
            numberSearched++;
        }

        if (Thread.currentThread() != srchthrd) {
            // Cancelled: leave the status line to stop() or to a newer search.
            return;
        }
        if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT) {
            setStatus("reached search limit of " + SEARCH_LIMIT);
        } else {
            setStatus("done");
        }
    }

    /**
     * Downloads {@code url} if its leading bytes look like HTML.
     *
     * @return the page text, or null when the content is not sniffed as
     *         text/html
     * @throws IOException if the page cannot be opened or read
     */
    private String fetchHtml(URL url) throws IOException {
        URLConnection urlConnection = url.openConnection();
        urlConnection.setAllowUserInteraction(false);
        // guessContentTypeFromStream needs mark/reset support, which a
        // BufferedInputStream provides.
        InputStream urlStream = new BufferedInputStream(urlConnection.getInputStream());
        try {
            String type = URLConnection.guessContentTypeFromStream(urlStream);
            if (!"text/html".equals(type)) {
                return null;
            }
            return readAll(urlStream);
        } finally {
            urlStream.close();
        }
    }

    /**
     * Opens {@code url} just long enough to sniff its content type.
     *
     * @return the guessed MIME type, or null if it cannot be determined
     * @throws IOException if the URL cannot be opened
     */
    private String sniffContentType(URL url) throws IOException {
        URLConnection urlConnection = url.openConnection();
        urlConnection.setAllowUserInteraction(false);
        InputStream linkStream = new BufferedInputStream(urlConnection.getInputStream());
        try {
            return URLConnection.guessContentTypeFromStream(linkStream);
        } finally {
            linkStream.close();
        }
    }

    /**
     * Reads the stream to its end, or until the crawl is cancelled, and
     * returns what was read decoded with the platform default charset. An
     * empty stream yields an empty string. The caller closes the stream.
     */
    private String readAll(InputStream in) throws IOException {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[1000];
        int numRead;
        while ((numRead = in.read(chunk)) != -1) {
            collected.write(chunk, 0, numRead);
            if (Thread.currentThread() != srchthrd) {
                break;
            }
        }
        return collected.toString();
    }

    /**
     * Walks the anchors of one HTML page. HTML links not seen before (and
     * allowed by robots.txt) are queued for searching; links of the target
     * type are added to the result list.
     *
     * @param url           the page the content came from; base for relative links
     * @param content       the page text
     * @param strTargetType the content type selected by the user
     * @param numberFound   number of matches reported so far
     * @return the updated number of matches
     */
    private int scanLinks(URL url, String content, String strTargetType, int numberFound) {
        String lowerCaseContent = content.toLowerCase();
        int index = 0;

        while (numberFound < SEARCH_LIMIT
                && Thread.currentThread() == srchthrd
                && (index = lowerCaseContent.indexOf("<a", index)) != -1) {
            if ((index = lowerCaseContent.indexOf("href", index)) == -1) {
                break;
            }
            if ((index = lowerCaseContent.indexOf("=", index)) == -1) {
                break;
            }
            index++;

            StringTokenizer st
                    = new StringTokenizer(content.substring(index), "\t\n\r\">#");
            if (!st.hasMoreTokens()) {
                break; // nothing but delimiters left in the document
            }
            String strLink = st.nextToken();

            URL urlLink;
            try {
                urlLink = new URL(url, strLink);
                strLink = urlLink.toString();
            } catch (MalformedURLException e) {
                setStatus("ERROR: bad URL " + strLink);
                continue;
            }

            // Links such as mailto: or ftp: are skipped, not treated as the
            // end of the page.
            if (!"http".equals(urlLink.getProtocol())) {
                continue;
            }

            String strType;
            try {
                strType = sniffContentType(urlLink);
            } catch (IOException e) {
                setStatus("ERROR: couldn't open URL " + strLink);
                continue;
            }
            if (strType == null) {
                continue;
            }

            if (strType.equals("text/html")
                    && !vcteserched.contains(strLink)
                    && !vctrsrch.contains(strLink)
                    && rbtsafe(urlLink)) {
                vctrsrch.addElement(strLink);
            }

            if (matchesTargetType(strType, strTargetType)
                    && !vctrmatch.contains(strLink)) {
                lstmtch.add(strLink);
                vctrmatch.addElement(strLink);
                numberFound++;
            }
        }
        return numberFound;
    }

    /**
     * Compares a sniffed MIME type with the user's selection. The choice
     * offers bare subtypes such as "html", while sniffed types have the form
     * "text/html", so a selection matches when it equals the whole type or
     * its subtype.
     */
    private static boolean matchesTargetType(String mimeType, String targetType) {
        return mimeType.equals(targetType) || mimeType.endsWith("/" + targetType);
    }

    /** Shows {@code status} in the status line below the result list. */
    void setStatus(String status) {
        lblstus.setText(status);
    }

    /** Handles the Search and Stop buttons. */
    public void actionPerformed(ActionEvent event) {
        String command = event.getActionCommand();
        if (SEARCH.equals(command)) {
            if (srchthrd != null) {
                // A crawl is already running; a Thread cannot be started twice.
                return;
            }
            setStatus("searching...");
            Thread worker = new Thread(this);
            // Publish the handle before starting so the worker recognises
            // itself as the active search thread from its first check.
            srchthrd = worker;
            worker.start();
        } else if (STOP.equals(command)) {
            stop();
        }
    }

    /**
     * Runs the crawler as a stand-alone application inside a Frame.
     *
     * NOTE(review): the proxy host and port below are hard-coded and look
     * site-specific; confirm they are still wanted.
     */
    public static void main(String argv[]) {
        Frame f = new Frame("frme");
        WebCrawler applet = new WebCrawler();
        // "Center" is the BorderLayout constraint name; an unknown name is
        // rejected by the layout manager.
        f.add("Center", applet);

        Properties prpps = new Properties(System.getProperties());
        prpps.put("http.proxySet", "true");
        prpps.put("http.proxyHost", "webcache-cup");
        prpps.put("http.proxyPort", "8080");
        Properties newprpps = new Properties(prpps);
        System.setProperties(newprpps);

        applet.init();
        applet.start();
        f.pack();
        f.setVisible(true);
    }
}
// Bookmarks