# Author extraction for the goose content extractor.
#
# The three functions below are the methods this patch adds to the extractor
# class in goose/extractors.py.  They take ``self`` and belong inside that
# class body (indent them one level when applying).
#
# Companion changes carried by the same patch:
#   * .gitignore: ignore ``author.py``.
#   * goose/article.py, in ``__init__``: add ``self.author = None`` right
#     after ``self.title = None``.
#   * goose/crawler.py, in ``crawl``: add
#     ``self.article.author = self.extractor.get_author()`` right after the
#     title is extracted.


def get_author(self):
    """\
    Fetch the article author.

    The meta author tag is tried first.  If it yields nothing, every
    ``<a href>`` in the document is scanned for a URL that looks like an
    author/profile link and the name is derived from that URL with
    ``clean_author``.

    Returns the author name, or an empty string when none is found.
    """
    import re

    # Look at meta tags first: an explicit author declaration wins outright.
    author = self.get_meta_author()
    if author:
        return author

    # Normalise "nothing found" to '' (the meta helper may hand back None).
    author = ''

    links = self.parser.getElementsByTag(self.article.doc, tag='a', attr='href')

    # no candidate links at all
    if not links:
        return author

    # NOTE(review): the dots are regex wildcards, not literal slashes, so
    # these match any href containing the keyword with at least one
    # character on each side (e.g. "/author/", but also "coauthors").
    author_pattern = re.compile(r'.author.')
    profile_pattern = re.compile(r'.(?:people|users).')

    for link in links:
        href = self.parser.getAttribute(link, 'href')
        if href is None:
            continue

        if author_pattern.search(href) is not None:
            candidate = self.clean_author(href)
            if candidate:
                # An explicit author link is definitive: stop searching.
                author = candidate
                break
        elif profile_pattern.search(href) is not None:
            candidate = self.clean_author(href)
            if candidate:
                # people/users links are only a fallback: remember the
                # latest one but keep looking for a real author link.
                author = candidate

    return author


def clean_author(self, a_href):
    """\
    Turn an author/profile URL into a display name.

    The last non-empty path segment is taken, hyphens become spaces and
    the result is title-cased, e.g. ``/author/john-doe/`` -> ``John Doe``.
    Any query string or fragment is ignored.

    Returns an empty string when the URL has no usable path segment.
    """
    if not a_href:
        return ''

    # Drop "?query" and "#fragment" so they never leak into the name.
    path = a_href.split('?', 1)[0].split('#', 1)[0]

    # Filtering up front copes with trailing or doubled slashes without
    # mutating a list while iterating over it.
    segments = [part for part in path.split('/') if part]
    if not segments:
        return ''

    return segments[-1].replace('-', ' ').title()


def get_meta_author(self):
    """\
    if the article has meta Author set in the source, use that
    """
    doc = self.article.doc
    content = None

    # Pages spell the meta name both "Author" and "author".  This assumes
    # the selector match may be case-sensitive, so both are tried, with the
    # original spelling first.
    for selector in ("meta[name=Author]", "meta[name=author]"):
        content = self.get_meta_content(doc, selector)
        if content:
            return content

    # Nothing found: hand back whatever empty value the helper produced.
    return content