From ad2cba0fbc1f81711170e2e854b198678885b984 Mon Sep 17 00:00:00 2001 From: John Davi Date: Fri, 30 May 2014 17:20:04 -0700 Subject: [PATCH 1/3] Adding support for Crawlbot and V3 APIs. --- README.md | 47 ++++++++++++++++++++++++++++++++--------------- client.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- example.py | 38 +++++++++++++++++++++++--------------- 3 files changed, 106 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 5f2a805..c37c3e5 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ##Preface -Identify and extract the important parts of any web page in Python! This client currently supports calls to the automatic APIs. +Identify and extract the important parts of any web page in Python! This client currently supports calls to Diffbot's Automatic APIs and Crawlbot. Installation @@ -34,18 +34,6 @@ api = "article" response = diffbot.request(url, token, api, version=2) ``` -###Frontpage API -An example call to the Frontpage API: - -``` -diffbot = DiffbotClient() -token = "SOME_TOKEN" -version = 2 -url = "http://www.huffingtonpost.com/" -api = "frontpage" -response = diffbot.request(url, token, api, version=version) -``` - ###Product API An example call to the Product API: @@ -70,8 +58,8 @@ api = "image" response = diffbot.request(url, token, api, version=version) ``` -###Classifier API -An example call to the Classifier API: +###Analyze API +An example call to the Analyze API: ``` diffbot = DiffbotClient() @@ -82,6 +70,35 @@ api = "analyze" response = diffbot.request(url, token, api, version=version) ``` +###Crawlbot API +An example call to the Crawlbot API: + +``` +token = "SOME_TOKEN" +name = "sampleCrawlName" +seeds = "http://www.twitter.com/" +apiUrl = "analyze" +sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl) +``` + +To check the status of a crawl: + +``` +sampleCrawl.status() +``` + +To delete or restart a crawl: + +``` +sampleCrawl.delete() +sampleCrawl.restart() +``` + +To pass additional arguments to a 
crawl: + + ``` +sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl,maxToCrawl=100,maxToProcess=50,notifyEmail="support@diffbot.com") +``` ##Testing diff --git a/client.py b/client.py index a9068f3..1925a61 100644 --- a/client.py +++ b/client.py @@ -1,11 +1,10 @@ import requests - class DiffbotClient(object): base_url = 'http://api.diffbot.com/' - def request(self, url, token, api, fields=None, version=2, **kwargs): + def request(self, url, token, api, fields=None, version=3, **kwargs): """ Returns a python object containing the requested resource from the diffbot api """ @@ -30,3 +29,53 @@ def format_version_string(version_number): Returns a string representation of the API version """ return 'v{}'.format(version_number) + +class DiffbotJob(DiffbotClient): + """ + Various calls for managing a Diffbot Crawlbot or Bulk API job. Subclasses must set self.params and self.jobType before any call is made. + """ + + def request(self,params): + response = requests.get(self.compose_url(self.jobType,3),params=params) + response.raise_for_status() # must be called; the bare attribute reference was a no-op + try: + return response.json() + except ValueError: # body is not valid JSON; show the raw text (returns None) + print response.text + + def start(self,params): + response = self.request(params) + return response + + def status(self): + response = self.request(self.params) + return response + + def delete(self): + temp_params = dict(self.params) # copy, so 'delete' does not stick in self.params + temp_params['delete'] = 1 + response = self.request(temp_params) + return response + + def restart(self): + temp_params = dict(self.params) # copy, so 'restart' does not stick in self.params + temp_params['restart'] = 1 + response = self.request(temp_params) + return response + +class DiffbotCrawl(DiffbotJob): + """ + Initializes a new Diffbot crawl. Pass additional arguments as necessary. 
+ """ + + def __init__(self,token,name,seeds,api,apiVersion=3,**kwargs): + self.params = { + "token": token, + "name": name, + } + startParams = dict(self.params) + startParams['seeds'] = seeds + startParams['apiUrl'] = self.compose_url(api,apiVersion) + startParams.update(kwargs) + self.jobType = "crawl" + self.start(startParams) \ No newline at end of file diff --git a/example.py b/example.py index 574adec..3764d43 100644 --- a/example.py +++ b/example.py @@ -1,15 +1,14 @@ -from client import DiffbotClient +from client import DiffbotClient,DiffbotCrawl from config import API_TOKEN import pprint - +import time print "Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n" diffbot = DiffbotClient() token = API_TOKEN -version = 2 url = "http://shichuan.github.io/javascript-patterns/" api = "article" -response = diffbot.request(url, token, api, version=2) +response = diffbot.request(url, token, api) print "\nPrinting response:\n" pp = pprint.PrettyPrinter(indent=4) print pp.pprint(response) @@ -18,10 +17,9 @@ print "Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n" diffbot = DiffbotClient() token = API_TOKEN -version = 2 url = "http://shichuan.github.io/javascript-patterns/" api = "article" -response = diffbot.request(url, token, api, fields=['title', 'type'], version=2) +response = diffbot.request(url, token, api, fields=['title', 'type']) print "\nPrinting response:\n" pp = pprint.PrettyPrinter(indent=4) print pp.pprint(response) @@ -30,10 +28,9 @@ print "Calling frontpage API endpoint on the url: http://www.huffingtonpost.com/...\n" diffbot = DiffbotClient() token = API_TOKEN -version = 2 url = "http://www.huffingtonpost.com/" api = "frontpage" -response = diffbot.request(url, token, api, version=version) +response = diffbot.request(url, token, api) print "\nPrinting response:\n" pp = pprint.PrettyPrinter(indent=4) print pp.pprint(response) @@ -42,10 +39,9 @@ 
print "Calling product API endpoint on the url: http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html...\n" diffbot = DiffbotClient() token = API_TOKEN -version = 2 url = "http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html" api = "product" -response = diffbot.request(url, token, api, version=version) +response = diffbot.request(url, token, api) print "\nPrinting response:\n" pp = pprint.PrettyPrinter(indent=4) print pp.pprint(response) @@ -54,10 +50,9 @@ print "Calling image API endpoint on the url: http://www.google.com/...\n" diffbot = DiffbotClient() token = API_TOKEN -version = 2 url = "http://www.google.com/" api = "image" -response = diffbot.request(url, token, api, version=version) +response = diffbot.request(url, token, api) print "\nPrinting response:\n" pp = pprint.PrettyPrinter(indent=4) print pp.pprint(response) @@ -66,10 +61,23 @@ print "Calling classifier API endpoint on the url: http://www.twitter.com/...\n" diffbot = DiffbotClient() token = API_TOKEN -version = 2 url = "http://www.twitter.com/" api = "analyze" -response = diffbot.request(url, token, api, version=version) +response = diffbot.request(url, token, api) print "\nPrinting response:\n" pp = pprint.PrettyPrinter(indent=4) -print pp.pprint(response) \ No newline at end of file +print pp.pprint(response) + +print "Create a new crawl of http://support.diffbot.com/ using the Article API...\n" +token = API_TOKEN +seeds = "http://support.diffbot.com" +api = "article" +name = "testCrawl" +diffbot = DiffbotCrawl(token,name,seeds,api) +time.sleep(5) +status = diffbot.status() +print "\nPrinting status:\n" +pp = pprint.PrettyPrinter(indent=4) +print pp.pprint(status) +print "\nDeleting test crawl.\n" +diffbot.delete() \ No newline at end of file From ae0b3c162cc475d26eb2067eea77097859e8c17d Mon Sep 17 00:00:00 2001 From: John Davi Date: Fri, 30 May 2014 17:22:45 -0700 Subject: [PATCH 2/3] Updating readme --- README.md | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c37c3e5..d8280cb 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ response = diffbot.request(url, token, api, version=version) ``` ###Crawlbot API -An example call to the Crawlbot API: +To start a new crawl, specify a crawl name, seed URLs, and the API via which URLs should be processed. An example call to the Crawlbot API: ``` token = "SOME_TOKEN" @@ -99,7 +99,7 @@ To pass additional arguments to a crawl: ``` sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl,maxToCrawl=100,maxToProcess=50,notifyEmail="support@diffbot.com") ``` - +g ##Testing First install the test requirements with the following command: From 883bceb18fa876d59ec58d60a5b2cc088f831dc5 Mon Sep 17 00:00:00 2001 From: John Davi Date: Fri, 30 May 2014 17:23:15 -0700 Subject: [PATCH 3/3] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d8280cb..8fd18c4 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ To pass additional arguments to a crawl: ``` sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl,maxToCrawl=100,maxToProcess=50,notifyEmail="support@diffbot.com") ``` -g + ##Testing First install the test requirements with the following command: