From 4c5c482951f7ca825593218deec26fbe82ee8f20 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 22 Mar 2004 17:14:33 +0000 Subject: [PATCH 01/10] This commit was manufactured by cvs2svn to create branch 'hen'. From b68ef8f25fca6cbda300aaf500f4f70a3e1c8422 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 22 Mar 2004 17:14:40 +0000 Subject: [PATCH 02/10] This commit was manufactured by cvs2svn to create branch 'hen'. --- runsql/example/config/CmdLine.properties | 5 + runsql/example/jndi.properties | 2 + runsql/example/sql/BOOTSTRAP.sql | 4 + runsql/example/sql/CHECK.sql | 1 + runsql/example/sql/QUERY.sql | 1 + runsql/example/sql/SCHEMA.sql | 11 +++ runsql/project.properties | 6 ++ runsql/project.xml | 92 +++++++++++++++++++ runsql/src/java/org/osjava/runsql/RunSql.java | 67 ++++++++++++++ 9 files changed, 189 insertions(+) create mode 100644 runsql/example/config/CmdLine.properties create mode 100644 runsql/example/jndi.properties create mode 100644 runsql/example/sql/BOOTSTRAP.sql create mode 100644 runsql/example/sql/CHECK.sql create mode 100644 runsql/example/sql/QUERY.sql create mode 100644 runsql/example/sql/SCHEMA.sql create mode 100644 runsql/project.properties create mode 100644 runsql/project.xml create mode 100644 runsql/src/java/org/osjava/runsql/RunSql.java diff --git a/runsql/example/config/CmdLine.properties b/runsql/example/config/CmdLine.properties new file mode 100644 index 00000000..2bc4007a --- /dev/null +++ b/runsql/example/config/CmdLine.properties @@ -0,0 +1,5 @@ +com.generationjava.jndi.datasource=true +driver=org.hsqldb.jdbcDriver +url=jdbc:hsqldb:hsql://localhost +user=sa +password= diff --git a/runsql/example/jndi.properties b/runsql/example/jndi.properties new file mode 100644 index 00000000..4b7b0e4b --- /dev/null +++ b/runsql/example/jndi.properties @@ -0,0 +1,2 @@ +java.naming.factory.initial=com.generationjava.jndi.PropertiesFactory +com.generationjava.jndi.root=config/ diff --git a/runsql/example/sql/BOOTSTRAP.sql b/runsql/example/sql/BOOTSTRAP.sql new file mode 100644 index 00000000..9373894c --- /dev/null +++ b/runsql/example/sql/BOOTSTRAP.sql @@ -0,0 +1,4 @@ +INSERT INTO Report (name, sql, age) VALUES ('foo', 'select foo', '95'); +INSERT INTO Report (name, sql, age) VALUES ('bar', 'select poo', '5'); +INSERT INTO Report (name, sql, age) VALUES ('poo', 'delete foo', '9'); +INSERT INTO Report (name, sql, age) VALUES ('barry', 'select wfoo', '9191'); diff --git a/runsql/example/sql/CHECK.sql b/runsql/example/sql/CHECK.sql new file mode 100644 index 00000000..9be0a1fc --- /dev/null +++ b/runsql/example/sql/CHECK.sql @@ -0,0 +1 @@ +SELECT count(*) AS total FROM Report diff --git a/runsql/example/sql/QUERY.sql b/runsql/example/sql/QUERY.sql new file mode 100644 index 00000000..f8cb6ce8 --- /dev/null +++ b/runsql/example/sql/QUERY.sql @@ -0,0 +1 @@ +SELECT * FROM Bandwidth diff --git a/runsql/example/sql/SCHEMA.sql b/runsql/example/sql/SCHEMA.sql new file mode 100644 index 00000000..bba7192d --- /dev/null +++ b/runsql/example/sql/SCHEMA.sql @@ -0,0 +1,11 @@ +DROP TABLE Bandwidth; +CREATE TABLE Bandwidth ( + last_total varchar(255), + this_total varchar(255), + last_in varchar(255), + this_in varchar(255), + last_out varchar(255), + this_out varchar(255), + last_sess varchar(255), + this_sess varchar(255) +); diff --git a/runsql/project.properties b/runsql/project.properties new file mode 100644 index 00000000..18a9eb1e --- /dev/null +++ b/runsql/project.properties @@ -0,0 +1,6 @@ +maven.checkstyle.header.file=/dev/null +maven.repo.remote=http://www.generationjava.com/jars/,http://www.ibiblio.org/maven/ + +maven.ui.banner.background=#000 +maven.ui.section.background=#000 +maven.ui.subsection.background=#000 diff --git a/runsql/project.xml b/runsql/project.xml new file mode 100644 index 00000000..f07a0e21 --- /dev/null +++ b/runsql/project.xml @@ -0,0 +1,92 @@ + + + + 3 + runsql + runsql + 0.1 + + GenerationJava + http://www.generationjava.com/ + /images/initials.jpg + + 2003 + org.osjava.runsql + /images/core-logo.jpg + + + A command line runner of SQL statements. I like piping sql statements into the mysql command on a unix machine. This is designed to let me do the same things in a database independent way. + + + SQL Script Runner + + http://www.osjava.org/runsql/ + www.osjava.org/runsql + /sites/org/osjava/www/runsql/ + /sites/org/osjava/www/builds/releases/runsql/ + + + scm:cvshen@umbongo.flamefew.net:/var/cvs:runsql + http://www.generationjava.com/view-cvs/viewcvs.cgi/runsql/ + + + + + 0.1 + 0.1 + HEAD + + + + + + + + + + + + Henri Yandell + hen + bayard@generationjava.com + GenerationJava + + Java Developer + + + + + + + + commons-dbutils + SNAPSHOT + + + genjava-core + 2.0 + http://www.generationjava.com/projects/GenJavaCore.shtml + + + commons-lang + 1.0 + http://jakarta.apache.org/commons/lang.html + + + + + + bayard@www.generationjava.com + + src/java + + src/test + + + + diff --git a/runsql/src/java/org/osjava/runsql/RunSql.java b/runsql/src/java/org/osjava/runsql/RunSql.java new file mode 100644 index 00000000..27885c7d --- /dev/null +++ b/runsql/src/java/org/osjava/runsql/RunSql.java @@ -0,0 +1,67 @@ +package org.osjava.runsql; + +import java.io.*; +import java.sql.*; +import javax.sql.*; +import javax.naming.*; +import org.apache.commons.dbutils.*; +import org.apache.commons.lang.*; +import com.generationjava.io.*; + +public class RunSql { + + static public void main(String[] args) throws Exception { + String dsname = args[0]; + InitialContext ctxt = new InitialContext(); + DataSource ds = (DataSource)ctxt.lookup(dsname); + Connection conn = ds.getConnection(); + try { + runScript(conn, System.in); + } catch(SQLException sqle) { + System.err.println("Script failure. "); + sqle.printStackTrace(); + } finally { + DbUtils.closeQuietly(conn); + } + } + + static public void runScript(Connection conn, InputStream in) throws SQLException { + runScript(conn, FileW.loadFile(in)); + } + + static public void runScript(Connection conn, String script) throws SQLException { + // TODO: Improve so a ; inside a string is ignored. + String[] stmts = StringUtils.split(script, ";"); + Statement stmt = conn.createStatement(); + try { + for(int i=0; i Date: Mon, 22 Mar 2004 17:14:41 +0000 Subject: [PATCH 03/10] This commit was manufactured by cvs2svn to create branch 'hen'. --- scraping-engine/TODO | 45 ++++++ scraping-engine/project.properties | 7 + scraping-engine/project.xml | 152 ++++++++++++++++++ scraping-engine/run.sh | 1 + scraping-engine/runc.sh | 1 + .../org/osjava/scraping/AbstractConfig.java | 70 ++++++++ .../org/osjava/scraping/AbstractPage.java | 66 ++++++++ .../org/osjava/scraping/AbstractParser.java | 13 ++ .../org/osjava/scraping/CheckingParser.java | 23 +++ .../src/java/org/osjava/scraping/Config.java | 19 +++ .../org/osjava/scraping/ConfigFactory.java | 9 ++ .../src/java/org/osjava/scraping/Engine.java | 93 +++++++++++ .../src/java/org/osjava/scraping/Fetcher.java | 12 ++ .../osjava/scraping/FetchingException.java | 12 ++ .../org/osjava/scraping/FetchingFactory.java | 21 +++ .../src/java/org/osjava/scraping/Header.java | 7 + .../java/org/osjava/scraping/HttpFetcher.java | 34 ++++ .../org/osjava/scraping/HttpsFetcher.java | 53 ++++++ .../java/org/osjava/scraping/JdbcStore.java | 62 +++++++ .../java/org/osjava/scraping/JndiConfig.java | 35 ++++ .../java/org/osjava/scraping/LoopParser.java | 24 +++ .../java/org/osjava/scraping/MemoryPage.java | 19 +++ .../java/org/osjava/scraping/MultiParser.java | 26 +++ .../java/org/osjava/scraping/MultiResult.java | 55 +++++++ .../org/osjava/scraping/NamespaceSession.java | 30 ++++ .../scraping/NotificationException.java | 12 ++ .../osjava/scraping/NotificationFactory.java | 28 ++++ .../java/org/osjava/scraping/Notifier.java | 11 ++ .../java/org/osjava/scraping/NullFetcher.java | 9 ++ .../org/osjava/scraping/NullNotifier.java | 8 + .../java/org/osjava/scraping/NullPage.java | 16 ++ .../java/org/osjava/scraping/NullParser.java | 12 ++ .../java/org/osjava/scraping/NullResult.java | 12 ++ .../java/org/osjava/scraping/NullStore.java | 24 +++ .../src/java/org/osjava/scraping/Page.java | 15 ++ .../src/java/org/osjava/scraping/Parser.java | 8 + .../org/osjava/scraping/ParserFactory.java | 16 ++ .../org/osjava/scraping/ParsingException.java | 12 ++ .../java/org/osjava/scraping/QuartzJob.java | 24 +++ .../org/osjava/scraping/QuartzScheduler.java | 91 +++++++++++ .../src/java/org/osjava/scraping/Result.java | 9 ++ .../src/java/org/osjava/scraping/Runner.java | 7 + .../java/org/osjava/scraping/Scheduler.java | 7 + .../org/osjava/scraping/SchedulerFactory.java | 27 ++++ .../src/java/org/osjava/scraping/Session.java | 14 ++ .../org/osjava/scraping/SimpleScheduler.java | 12 ++ .../src/java/org/osjava/scraping/Store.java | 9 ++ .../org/osjava/scraping/StoreFactory.java | 16 ++ .../org/osjava/scraping/StoringException.java | 12 ++ .../org/osjava/scraping/TabularResult.java | 20 +++ .../scraping/parser/PassThroughParser.java | 20 +++ .../osjava/scraping/util/FactoryUtils.java | 34 ++++ 52 files changed, 1374 insertions(+) create mode 100644 scraping-engine/TODO create mode 100644 scraping-engine/project.properties create mode 100644 scraping-engine/project.xml create mode 100755 scraping-engine/run.sh create mode 100755 scraping-engine/runc.sh create mode 100644 scraping-engine/src/java/org/osjava/scraping/AbstractConfig.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/AbstractPage.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/AbstractParser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/CheckingParser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Config.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/ConfigFactory.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Engine.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Fetcher.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/FetchingException.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/FetchingFactory.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Header.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/HttpFetcher.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/HttpsFetcher.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/JdbcStore.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/JndiConfig.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/LoopParser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/MemoryPage.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/MultiParser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/MultiResult.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NamespaceSession.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NotificationException.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NotificationFactory.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Notifier.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NullFetcher.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NullNotifier.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NullPage.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NullParser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NullResult.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/NullStore.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Page.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Parser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/ParserFactory.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/ParsingException.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/QuartzJob.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/QuartzScheduler.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Result.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Runner.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Scheduler.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/SchedulerFactory.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Session.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/SimpleScheduler.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/Store.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/StoreFactory.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/StoringException.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/TabularResult.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/parser/PassThroughParser.java create mode 100644 scraping-engine/src/java/org/osjava/scraping/util/FactoryUtils.java diff --git a/scraping-engine/TODO b/scraping-engine/TODO new file mode 100644 index 00000000..d6b6a601 --- /dev/null +++ b/scraping-engine/TODO @@ -0,0 +1,45 @@ +Hide Config behind Session?? + +Implement Store + using hsqldb [DONE. Needs testing] +Implement Notification + [DONE. Needs implementation of EmailNotifier] +Implement Scheduler + [DONE. Does not fulfill all requirements yet] +::: +All must be pluggable + +Implement a 'REPLACE' statement style Store, which tries an update, then +an insert. or vice-versa. + +Different threads for db than io?? + +ClassLoader for parsers +Scheduler needs to check config to figure out if it should reschedule + +Schedulers need to understand: + +cron +'startup' +interval based +[ALL DONE] +Also need to understand the concept of: Run until success or Time. + +Pages need to know their URIs + +CheckingParsers need to be thought about. +Sometimes the Check might want to be a lot of times. +Maybe a Checker exists, but not a CheckingParser. So Parsers +can call Checkers. + +Or: A Parser has a precondition/postcondition. These are the checkers. + +Specify a Page content-type. ie) loose html, xhtml, csv, xml, xls etc. + This would lead to a sub-class of Page in reply + +Long-term: + +Provide a scraping language +Provide a design GUI +Provide an automatic analyser. Looks for tables with high levels of +content. Looks for comments etc. diff --git a/scraping-engine/project.properties b/scraping-engine/project.properties new file mode 100644 index 00000000..f498c725 --- /dev/null +++ b/scraping-engine/project.properties @@ -0,0 +1,7 @@ +maven.checkstyle.header.file=/dev/null +maven.repo.remote=http://www.generationjava.com/jars2/,http://www.ibiblio.org/maven/ + +maven.ui.banner.background=#000 +maven.ui.section.background=#000 +maven.ui.subsection.background=#000 + diff --git a/scraping-engine/project.xml b/scraping-engine/project.xml new file mode 100644 index 00000000..087a87bf --- /dev/null +++ b/scraping-engine/project.xml @@ -0,0 +1,152 @@ + + + + 3 + scraping-engine + scraping-engine + 0.1 + + GenerationJava + http://www.generationjava.com/ + /images/initials.jpg + + 2003 + com.generationjava.jndi + /images/core-logo.jpg + + + scraping-engine + + + A simple implementation of JNDI. It is entirely library based, so no server instances are started, and it sits upon Java .properties files, so it is easy to use and simple to understand. The .properties files may be either on the file system or in the classpath. + + + A JNDI implementation using .properties + + http://www.generationjava.com/ + + www.generationjava.com/maven/scraping-engine + /sites/com/generationjava/www/maven/genjava/scraping-engine/ + /sites/com/generationjava/www/maven/builds/ + + + scm:cvshen@umbongo.flamefew.net:/var/cvs:scraping-engine + http://www.generationjava.com/view-cvs/viewcvs.cgi/scraping-engine/ + + + + + 1.0 + 1.0 + HEAD + + + + + + + + + + + + Henri Yandell + hen + bayard@generationjava.com + GenerationJava + + Java Developer + + + + + + + + + + + + + log4j + 1.2.7 + http://jakarta.apache.org/log4j/ + + + commons-lang + 1.0 + http://jakarta.apache.org/commons/lang.html + + + commons-httpclient + 2.0-alpha2 + http://jakarta.apache.org/commons/ + + + commons-collections + 2.1 + http://jakarta.apache.org/commons/ + + + commons-dbutils + SNAPSHOT + http://jakarta.apache.org/commons/ + + + genjava-core + 2.0 + http://www.generationjava.com/ + + + simple-jndi + 0.5 + http://www.generationjava.com/ + + + quartz + 1.0.7 + http://www.part.net/quartz.html + + + + + + bayard@www.generationjava.com + + src/java + + src/test + + + + + + + include = *.dtd + include = log4j.properties + + + + + + + diff --git a/scraping-engine/run.sh b/scraping-engine/run.sh new file mode 100755 index 00000000..92290745 --- /dev/null +++ b/scraping-engine/run.sh @@ -0,0 +1 @@ +java -classpath .:target/scraping-engine-0.1.jar:/usr/local/javalib/commons-collections-2.1.jar:/usr/local/javalib/commons-lang-1.0.1.jar:/usr/local/javalib/genjava-core-2.0.jar:config/:/usr/local/javalib/maven/repository/simple-jndi/jars/simple-jndi-0.5.jar:/usr/local/javalib/commons-httpclient-2.0-alpha2.jar:/usr/local/javalib/commons-logging-1.0.1.jar:/usr/local/javalib/maven/repository/commons-dbutils/jars/commons-dbutils-SNAPSHOT.jar:/usr/local/javalib/mm.mysql-2.0.9-bin.jar:/Users/hen/Desktop/hsqldb/lib/hsqldb.jar:/usr/local/javalib/maven/repository/log4j/jars/log4j-1.2.7.jar:/usr/local/javalib/maven/repository/quartz/jars/quartz-1.0.7.jar org.osjava.scraping.Engine diff --git a/scraping-engine/runc.sh b/scraping-engine/runc.sh new file mode 100755 index 00000000..6837e24d --- /dev/null +++ b/scraping-engine/runc.sh @@ -0,0 +1 @@ +javac -classpath /usr/local/javalib/commons-lang-1.0.1.jar:/usr/local/javalib/genjava-core-2.0.jar:target/scraping-engine-0.1.jar:.:/usr/local/javalib/maven/repository/log4j/jars/log4j-1.2.7.jar com/*/scraper/*.java diff --git a/scraping-engine/src/java/org/osjava/scraping/AbstractConfig.java b/scraping-engine/src/java/org/osjava/scraping/AbstractConfig.java new file mode 100644 index 00000000..0ef2a1aa --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/AbstractConfig.java @@ -0,0 +1,70 @@ +package org.osjava.scraping; + +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.lang.NumberUtils; +import java.util.Date; + +public abstract class AbstractConfig implements Config { + + private String context = ""; + + protected abstract Object getValue(String key); + + public Object get(String key) { + return getValue( getContext()+key ); + } + + public boolean has(String key) { + return (get(key) != null); + } + + public Object getAbsolute(String key) { + return getValue(key); + } + + public String getString(String key) { + return (String)get(key); + } + + public Date getDate(String key) { + try { + return java.text.DateFormat.getDateInstance(java.text.DateFormat.SHORT).parse(key); + } catch(java.text.ParseException pe) { + return null; + } + } + + // rely on simple-jndi's type + public int getInt(String key) { + return NumberUtils.stringToInt(getString(key)); + } + + public List getList(String key) { + Object obj = get(key); + if(!(obj instanceof List)) { + List list = new ArrayList(1); + list.add(obj); + obj = list; + } + return (List)obj; + } + + public void setContext(String context) { + this.context = context; + } + + public String getContext() { + return this.context; + } + + public Config cloneConfig() { + try { + return (Config)this.clone(); + } catch(CloneNotSupportedException cnse) { + // ignore + throw new RuntimeException("Cloning of a Config failed. This should be impossible. "); + } + } + +} diff --git a/scraping-engine/src/java/org/osjava/scraping/AbstractPage.java b/scraping-engine/src/java/org/osjava/scraping/AbstractPage.java new file mode 100644 index 00000000..7aec7eab --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/AbstractPage.java @@ -0,0 +1,66 @@ +package org.osjava.scraping; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import java.io.BufferedReader; + +import org.apache.log4j.Logger; + +public abstract class AbstractPage implements Page { + + private static Logger logger = Logger.getLogger(AbstractPage.class); + + private String documentBase; + + public AbstractPage() { + } + + public abstract Reader read() throws IOException; + + public Page fetch(String uri, Config cfg, Session session) throws FetchingException { + int idx = uri.indexOf("://"); + if(idx == -1) { + // TODO: also check it is less than 15 or something?? + uri = this.documentBase + "/" + uri; + } + logger.debug("Fetching: "+uri); + Fetcher fetcher = FetchingFactory.getFetcher(cfg, session); + Page page = fetcher.fetch(uri, cfg, session); + return page; + } + + public void setDocumentBase(String documentBase) { + logger.debug("Document base: "+documentBase); + this.documentBase = documentBase; + } + + public String getDocumentBase() { + return this.documentBase; + } + + public String readAsString() throws IOException { + Reader rdr = null; + try { + rdr = this.read(); + BufferedReader bfr = new BufferedReader(rdr); + StringBuffer buffer = new StringBuffer(); + String line = ""; + while( (line = bfr.readLine()) != null) { + buffer.append(line); + buffer.append("\n"); + } + return buffer.toString(); + } finally { + if(rdr != null) { + try { + rdr.close(); + } catch(IOException ioe) { + // ignore + } + } + } + } + +} diff --git a/scraping-engine/src/java/org/osjava/scraping/AbstractParser.java b/scraping-engine/src/java/org/osjava/scraping/AbstractParser.java new file mode 100644 index 00000000..3ed3099d --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/AbstractParser.java @@ -0,0 +1,13 @@ +package org.osjava.scraping; + +public abstract class AbstractParser implements Parser { + + abstract public Result parse(Page page, Config cfg, Session session) throws ParsingException; + + public void startUp(Config cfg) throws Exception { } + public void bringDown(Config cfg) throws Exception { } + + // helper methods + // ?? + +} diff --git a/scraping-engine/src/java/org/osjava/scraping/CheckingParser.java b/scraping-engine/src/java/org/osjava/scraping/CheckingParser.java new file mode 100644 index 00000000..3344505d --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/CheckingParser.java @@ -0,0 +1,23 @@ +package org.osjava.scraping; + +public abstract class CheckingParser extends AbstractParser { + + public Result parse(Page page, Config cfg, Session session) throws ParsingException { + Header header = parseHeader(page, cfg, session); + Store store = StoreFactory.getStore(cfg, session); + try { + boolean found = store.exists(header, cfg, session); + if(found) { + return new NullResult(); + } else { + return parseBody(page, header, cfg, session); + } + } catch(StoringException se) { + return new NullResult(); + } + } + + public abstract Header parseHeader(Page page, Config cfg, Session session) throws ParsingException; + public abstract Result parseBody(Page page, Header header, Config cfg, Session session) throws ParsingException; + +} diff --git a/scraping-engine/src/java/org/osjava/scraping/Config.java b/scraping-engine/src/java/org/osjava/scraping/Config.java new file mode 100644 index 00000000..088fa178 --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/Config.java @@ -0,0 +1,19 @@ +package org.osjava.scraping; + +import java.util.List; +import java.util.Date; + +public interface Config extends Cloneable { + + public Object get(String key); + public boolean has(String key); + public Object getAbsolute(String key); + public String getString(String key); + public int getInt(String key); + public Date getDate(String key); + public List getList(String key); + public void setContext(String context); + public String getContext(); + public Config cloneConfig(); + +} diff --git a/scraping-engine/src/java/org/osjava/scraping/ConfigFactory.java b/scraping-engine/src/java/org/osjava/scraping/ConfigFactory.java new file mode 100644 index 00000000..e501d1e6 --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/ConfigFactory.java @@ -0,0 +1,9 @@ +package org.osjava.scraping; + +public class ConfigFactory { + + static public Config getConfig(String[] args) { + return new JndiConfig(); + } + +} diff --git a/scraping-engine/src/java/org/osjava/scraping/Engine.java b/scraping-engine/src/java/org/osjava/scraping/Engine.java new file mode 100644 index 00000000..fa78e367 --- /dev/null +++ b/scraping-engine/src/java/org/osjava/scraping/Engine.java @@ -0,0 +1,93 @@ +package org.osjava.scraping; + +import java.util.List; +import java.io.Reader; + +public class Engine implements Runner { + + public static void main(String[] args) { + Engine engine = new Engine(); + engine.run(args); + } + +/// TODO: Implement the Scheduler aspect +/// The Scheduler notifies only this class. It is +/// then up to this Engine to run the parsers. +/// TODO: Put the scraping and db in a different thread + public void run(String[] args) { + // load the config + Config cfg = ConfigFactory.getConfig(args); + + + // test and how schedule=startup will be handled + List list = cfg.getList("org.osjava.scrapers"); + for(int i=0; i