Site Dump

Note: Very preliminary stuff here


This script collects all available page metadata and writes it as CSV to a file in the current working directory on your hard drive.
# ToDo:
# snippet to replace all (None) in return data with ''
# run double_quotes for all CSV output (e.g. title)
# use csv_writer
from operator import itemgetter
import os
from time import sleep
import sys
 
def text(string):
  "Return any string as-is; convert None to '(None)'"
  if string is not None:
    return string
  else:
    return '(None)'
 
def escape_crlf(string):
  "Removes line-ends and replaces them with a literal '\\n' so a value stays on one CSV line"
  return '\\n'.join(string.splitlines())
 
def double_quotes(string):
  "Doubles any double-quotes so that CSV import will show single double-quotes"
  return string.replace('"','""')
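# Hypothetical examples of the helpers above (illustrative values, not from the API):
#   text(None)                     -> '(None)'
#   escape_crlf('first\nsecond')   -> 'first\\nsecond'
#   double_quotes('say "hi"')      -> 'say ""hi""'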
 
try:
  import api_parms
  print 'Imported api_parms'
except ImportError:
  print """
To run this script, you must first create a script 'api_parms.py':
 
user_key = 'your-wiki-userid:your-api-key'  # to receive your API key
                               # post to http://www.wikidot.com/forum/t-137525
site = 'your-wikidot.site'     # e. g. site = 'xml-api'
(optional) category = 'cat'    # e. g. category = 'doc' or category = None
 
and save it to the following directory, so that it can be imported by this script:
"""
  print os.getcwd()
  sleep(7)
  sys.exit(1)
 
from xmlrpclib import ServerProxy
s = ServerProxy('https://' + api_parms.user_key + '@www.wikidot.com/xml-rpc-api.php')
 
if api_parms.site:
  site = api_parms.site
else:
  site = 'xml-api'
# category is optional in api_parms.py, so fall back to None if it is missing
category = getattr(api_parms, 'category', None)
 
sep = ','
quote = '"'
sep1 = quote
sep3 = quote + sep + quote
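# sep1 is the quote that opens or closes a CSV row; sep3 ('","') closes one
# quoted field and opens the next, so every field ends up wrapped in double quotes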
 
if category:
  print 'Listing pages ' + site + '.wikidot.com/' + category + ':*'
  pages = s.site.pages({'site': site, 'category': category})
else:
  print 'Listing pages ' + site + '.wikidot.com/*'
  pages = s.site.pages({'site': site})
 
print len(pages), 'pages found'
pages.sort(key=itemgetter('name'))
pages.sort(key=itemgetter('category'))
 
csv_name = site
if category:
  csv_name = site + '_' + category
csv = open(csv_name + '.csv', 'w')
csv.write(sep1 \
  + 'site' + sep3 \
  + 'category' + sep3 \
  + 'name' + sep3 \
  + 'full_name' + sep3 \
  + 'title' + sep3 \
  + 'title_shown' + sep3 \
  + 'title_or_unix_name' + sep3 \
  + 'parent_page' + sep3 \
  + 'user_created' + sep3 \
  + 'date_created' + sep3 \
  + 'user_edited' + sep3 \
  + 'date_edited' + sep3 \
  + 'tag_string' + sep1 + '\n')
for page in pages:
  csv.write(sep1 \
  + text(page['site']) + sep3 \
  + text(page['category']) + sep3 \
  + text(page['name']) + sep3 \
  + text(page['full_name']) + sep3 \
  + text(page['title']).encode('cp437','backslashreplace') + sep3 \
  + text(page['title_shown']).encode('cp437','backslashreplace') + sep3 \
  + text(page['title_or_unix_name']).encode('cp437','backslashreplace') + sep3 \
  + text(page['parent_page']) + sep3 \
  + text(page['user_created']) + sep3 \
  + text(page['date_created']) + sep3 \
  + text(page['user_edited']) + sep3 \
  + text(page['date_edited']) + sep3 \
  + text(page['tag_string']).encode('cp437','backslashreplace') + sep1 + '\n')
csv.close()
 
sleep(10)
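The ToDo list above mentions switching to the csv module's writer, which handles quoting (and doubling of embedded double-quotes) automatically and makes it easy to drop the '(None)' placeholders. Below is a rough sketch of that variant, in the same Python 2 style as the script, assuming pages and csv_name have already been set up as above; the cell() helper and the field list are illustrative, not part of the original script. The module is imported under an alias because the script already uses the name csv for its output file handle.

import csv as csvmod

fields = ['site', 'category', 'name', 'full_name', 'title', 'title_shown',
          'title_or_unix_name', 'parent_page', 'user_created', 'date_created',
          'user_edited', 'date_edited', 'tag_string']

def cell(value):
  "Replace None with '' (first ToDo item) and encode unicode values for cp437 consoles"
  if value is None:
    return ''
  if isinstance(value, unicode):
    return value.encode('cp437', 'backslashreplace')
  return value

out = open(csv_name + '.csv', 'wb')                     # csv module wants binary mode on Python 2
writer = csvmod.writer(out, quoting=csvmod.QUOTE_ALL)   # quotes every field, doubles embedded '"'
writer.writerow(fields)
for page in pages:
  writer.writerow([cell(page[name]) for name in fields])
out.close()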