WordPress.com Blog Scraper (Beta)

This is a software concept that I conceived and developed, along with an engineer from Facebook, that allows a person to go to a WordPress.com site and download all of its posted blogs.  As far as I am aware, we are the first software developers to release an application like this to the public.

We are in the beta testing phase.  When completed, this will be a standalone program with a GUI that comes as a single executable, installs itself along with Python, and lets the user enter the name of a blog.  Once the program is started, the scraper goes from post to post, downloads each one, and stores it as a printable PDF.  This is great for documenting the attacks of anonymous bloggers.

While you could also use it to download a whole blog full of recipes, this application was designed as a counter cyber warfare tool, because activists often become the targets of anonymous bloggers.

The current code for this scraper is as follows:

import json
import os
import urllib
import urllib2

from libs import pdfkit

# wp site to scrape
WP_SITE = '[insert blog URL here minus brackets]'

# posts endpoint of the WordPress.com REST API
POSTS_ENDPOINT_URL = 'https://public-api.wordpress.com/rest/v1.1/sites/%s/posts/'

# wkhtmltopdf executable location
WKHTMLTOPDF_LOCATION = os.path.join(os.getcwd(), 'libs', 'wkhtmltopdf.exe')

# directory where the pdfs are written
WRITE_LOCATION = os.path.join(os.getcwd(), 'pdfs')

# file to save the timestamp of the last retrieved post
BOOKMARK = os.path.join(WRITE_LOCATION, 'bookmark')

# number of post URLs to fetch at a time
POST_COUNT_PER_QUERY = 20


class Status(object):
    """Tracks how many PDFs exist on disk and how many this run added."""

    def __init__(self):
        self.total = len([name for name in os.listdir(WRITE_LOCATION) if name.endswith('.pdf')])
        self.new = 0

    def get_status_string(self):
        return '%s new pdfs | %s total pdfs' % (self.new, self.total)

    def increment(self):
        self.total += 1
        self.new += 1


def save_bookmark_timestamp(timestamp):
    with open(BOOKMARK, 'w') as bookmark:
        bookmark.write(timestamp or '')

def read_bookmark_timestamp():
    if os.path.exists(BOOKMARK):
        with open(BOOKMARK) as bookmark:
            return bookmark.read()

def write_pdf_from_url(url, filename, status):
    print 'writing pdf %s (%s)' % (filename, status.get_status_string())
    # wkhtmltopdf can fail transiently, so retry up to three times
    for i in range(3):
        try:
            pdfkit.from_url(
                url,
                os.path.join(WRITE_LOCATION, filename),
                configuration=pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_LOCATION),
                options={'quiet': ''},
            )
            status.increment()
            return
        except KeyboardInterrupt:
            raise
        except Exception:
            print 'try #%s failed on %s, retrying' % (i + 1, filename)

def get_post_pdfs(posts, status):
    for post in posts:
        write_pdf_from_url(post['URL'], '%s.pdf' % post['ID'], status)

def get_next_n_posts(number, start_timestamp, status):
    # ask the API for posts published after the bookmark, oldest first
    params = {'order': 'ASC', 'number': number}
    if start_timestamp:
        params['after'] = start_timestamp
    root = get_wp_site_posts(WP_SITE, params)
    get_post_pdfs(root['posts'], status)
    # the date of the last post in this batch becomes the next bookmark
    return root['posts'][-1]['date'] if len(root['posts']) > 0 else start_timestamp

def get_wp_site_posts(site, params=None):
    wp_url = POSTS_ENDPOINT_URL % site
    if params is not None:
        url_params = urllib.urlencode(params)
        wp_url = '%s?%s' % (wp_url, url_params)
    response = urllib2.urlopen(wp_url).read()
    return json.loads(response)

def main():
    try:
        # create the output directory on first run
        if not os.path.exists(WRITE_LOCATION):
            os.makedirs(WRITE_LOCATION)
        status = Status()
        bookmark_timestamp = ''
        new_bookmark_timestamp = read_bookmark_timestamp()
        # keep fetching batches until the bookmark stops advancing
        while new_bookmark_timestamp != bookmark_timestamp:
            bookmark_timestamp = new_bookmark_timestamp
            print 'fetching %s posts after %s' % (POST_COUNT_PER_QUERY, bookmark_timestamp)
            new_bookmark_timestamp = get_next_n_posts(POST_COUNT_PER_QUERY, bookmark_timestamp, status)
            save_bookmark_timestamp(new_bookmark_timestamp)
        print 'no new posts found'
    finally:
        raw_input('done')

if __name__ == '__main__':
    main()
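
For beta testers who want to sanity-check a site before running the full scraper, the same posts endpoint can be queried by hand.  Here is a minimal Python 2 sketch under the same assumptions as the script above; the site name is hypothetical, and the found field is my reading of the API response (the scraper itself only touches posts, ID, URL, and date):

import json
import urllib2

SITE = 'example.wordpress.com'  # hypothetical site name, replace with a real one
URL = 'https://public-api.wordpress.com/rest/v1.1/sites/%s/posts/?number=5&order=ASC' % SITE

# fetch one small batch and print what the scraper would download
root = json.loads(urllib2.urlopen(URL).read())
print '%s posts found on %s' % (root['found'], SITE)
for post in root['posts']:
    print post['ID'], post['date'], post['URL']

If this prints a handful of IDs and URLs, the scraper should be able to walk the whole archive the same way.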
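
One note on compatibility: the script targets Python 2 (urllib2, print statements, raw_input).  For anyone testing under Python 3, the fetch helper maps onto the renamed standard-library modules; this is a sketch of the equivalent, not part of the beta itself:

import json
import urllib.parse
import urllib.request

def get_wp_site_posts(site, params=None):
    # same logic as the Python 2 helper above, using the renamed stdlib modules
    wp_url = 'https://public-api.wordpress.com/rest/v1.1/sites/%s/posts/' % site
    if params is not None:
        wp_url = '%s?%s' % (wp_url, urllib.parse.urlencode(params))
    with urllib.request.urlopen(wp_url) as response:
        return json.loads(response.read().decode('utf-8'))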
