|
The Encog Project | ||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||
java.lang.Object
  org.encog.bot.spider.workload.sql.SQLWorkloadManager
public class SQLWorkloadManager
SQLWorkloadManager: This workload manager stores the URL lists in an SQL database. This workload manager uses two tables, which can be created as follows:

CREATE TABLE 'spider_host' (
  'host_id' int(10) unsigned NOT NULL auto_increment,
  'host' varchar(255) NOT NULL default '',
  'status' varchar(1) NOT NULL default '',
  'urls_done' int(11) NOT NULL,
  'urls_error' int(11) NOT NULL,
  PRIMARY KEY ('host_id')
)

CREATE TABLE 'spider_workload' (
  'workload_id' int(10) unsigned NOT NULL auto_increment,
  'host' int(10) unsigned NOT NULL,
  'url' varchar(2083) NOT NULL default '',
  'status' varchar(1) NOT NULL default '',
  'depth' int(10) unsigned NOT NULL,
  'url_hash' int(11) NOT NULL,
  'source_id' int(11) NOT NULL,
  PRIMARY KEY ('workload_id'),
  KEY 'status' ('status'),
  KEY 'url_hash' ('url_hash'),
  KEY 'host' ('host')
)
| Field Summary | |
|---|---|
static int |
HASH_MASK
The mask used to generate URL hashes. |
| Constructor Summary | |
|---|---|
SQLWorkloadManager()
|
|
| Method Summary | |
|---|---|
boolean |
add(java.net.URL url,
java.net.URL source,
int depth)
Add the specified URL to the workload. |
void |
clear()
Clear the workload. |
void |
close()
Close the workload manager. |
boolean |
contains(java.net.URL url)
Determine if the workload contains the specified URL. |
java.net.URL |
convertURL(java.lang.String aurl)
Convert the specified String to a URL. |
SQLHolder |
createSQLHolder()
Create the correct type of SQL holder for this workload manager. |
int |
getColumnSize(java.lang.String table,
java.lang.String column)
Return the size of the specified column. |
RepeatableConnection |
getConnection()
|
java.lang.String |
getCurrentHost()
Get the current host. |
int |
getDepth(java.net.URL url)
Get the depth of the specified URL. |
java.net.URL |
getSource(java.net.URL url)
Get the source page that contains the specified URL. |
java.net.URL |
getWork()
Get a new URL to work on. |
void |
init(Spider spider)
Set up this workload manager for the specified spider. |
void |
markError(java.net.URL url)
Mark the specified URL as error. |
void |
markProcessed(java.net.URL url)
Mark the specified URL as successfully processed. |
java.lang.String |
nextHost()
Move on to process the next host. |
void |
resume()
Set up the workload so that it can be resumed from where the last spider left the workload. |
void |
waitForWork(int time,
java.util.concurrent.TimeUnit unit)
If there is currently no work available, then wait until a new URL has been added to the workload. |
boolean |
workloadEmpty()
Return true if there are no more workload units. |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
|---|
public static final int HASH_MASK
| Constructor Detail |
|---|
public SQLWorkloadManager()
| Method Detail |
|---|
public boolean add(java.net.URL url,
java.net.URL source,
int depth)
Specified by: add in interface WorkloadManager
Parameters: url - The URL to be added. source - The page that contains this URL. depth - The depth of this URL.
Throws: WorkloadException
public void clear()
Specified by: clear in interface WorkloadManager
public void close()
public boolean contains(java.net.URL url)
Specified by: contains in interface WorkloadManager
Parameters: url - The URL to search the workload for.
public java.net.URL convertURL(java.lang.String aurl)
Specified by: convertURL in interface WorkloadManager
Parameters: aurl - A String to convert into a URL.
public SQLHolder createSQLHolder()
public int getColumnSize(java.lang.String table,
java.lang.String column)
Parameters: table - The table that contains the column. column - The column to get the size for.
public RepeatableConnection getConnection()
public java.lang.String getCurrentHost()
Specified by: getCurrentHost in interface WorkloadManager
public int getDepth(java.net.URL url)
Specified by: getDepth in interface WorkloadManager
Parameters: url - The URL to get the depth of.
public java.net.URL getSource(java.net.URL url)
Specified by: getSource in interface WorkloadManager
Parameters: url - The URL to seek the source for.
public java.net.URL getWork()
Specified by: getWork in interface WorkloadManager
public void init(Spider spider)
Specified by: init in interface WorkloadManager
Parameters: spider - The spider using this workload manager.
Throws: an exception (type not stated here) if there is an error setting up the workload manager.
public void markError(java.net.URL url)
Specified by: markError in interface WorkloadManager
Parameters: url - The URL that had an error.
Throws: an exception (type not stated here) if the specified URL could not be marked.
public void markProcessed(java.net.URL url)
Specified by: markProcessed in interface WorkloadManager
Parameters: url - The URL to mark as processed.
Throws: an exception (type not stated here) if the specified URL could not be marked.
public java.lang.String nextHost()
Specified by: nextHost in interface WorkloadManager
public void resume()
resume in interface WorkloadManager
public void waitForWork(int time,
java.util.concurrent.TimeUnit unit)
Specified by: waitForWork in interface WorkloadManager
Parameters: time - The amount of time to wait. unit - What time unit is being used.
public boolean workloadEmpty()
workloadEmpty in interface WorkloadManager
|
The Encog Project | ||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||