#!/usr/bin/env python
# CraigsList Parser - Developed by acidvegas in Python (https://acid.vegas/random)
'''
Random script to parse all the countries, states, cities, & sections/sub-sections on CraigsList
Don't know what I am doing with this yet...
'''
import re, time, urllib.request
def between(source, start, stop):
    """Return the text found between two delimiters in ``source``.

    ``start`` and ``stop`` are concatenated into the regular expression
    unescaped, so they are treated as regex fragments rather than literal
    text. Matching is case-insensitive and non-greedy, and ``.`` does not
    match a newline, so the captured text cannot span a line break.

    Returns the captured text of the first match, or ``False`` (not ``None``)
    when nothing matches.
    """
    pattern = start + '(.*?)' + stop
    match = re.search(pattern, source, re.IGNORECASE | re.MULTILINE)
    return match.group(1) if match else False
def get_source(url):
    """Download ``url`` and return its body decoded to ``str``.

    The charset declared in the response's Content-Type header is used when
    one is present; otherwise the body is decoded with the ``bytes.decode()``
    default (UTF-8). The request uses a 10-second timeout.

    Exceptions raised by ``urlopen`` or by decoding are not caught here and
    propagate to the caller.
    """
    # The with-block closes the response (and its connection) even if
    # reading the body raises.
    with urllib.request.urlopen(url, timeout=10) as response:
        charset = response.headers.get_content_charset()
        body = response.read()
    return body.decode(charset) if charset else body.decode()
# db['category'] maps a category title to the number of city pages that
# listed it (incremented inside the city loop below). db['subcat'] starts
# empty; nothing in the visible part of the script writes to it.
db = {'category':dict(),'subcat':dict()}
# Download the Craigslist site-directory page as decoded text.
source = get_source('http://www.craigslist.org/about/sites?lang=en&cc=us')
# NOTE(review): the pattern literal below spans several physical lines, which
# a single-quoted string cannot do; it looks like HTML markup was stripped
# from it. Restore the original pattern before running. The loop below reads
# country[0] and country[1], so the original presumably had two capture groups.
countries = re.findall('
(.*?)
', source, re.IGNORECASE|re.MULTILINE)
# Remove every line break from the page. Presumably this is because the
# patterns used afterwards do not enable DOTALL, so '.' would otherwise stop
# at each newline.
source = source.replace('\n', '').replace('\r','')
# Nested result mapping built by the loops below:
# lowercased country[0] -> lowercased state -> city[1] -> city[0] up to '/?'.
main_data = dict()
# Running totals of how many states and cities the findall calls returned.
statess = 0
citiess = 0
for country in countries:
main_data[country[0].lower()] = dict()
data = between(source, '{1}
'.format(country[0], country[1]),' ')
states = re.findall('(.*?)
', data, re.IGNORECASE|re.MULTILINE)
statess += len(states)
for state in states:
main_data[country[0].lower()][state.lower()] = dict()
state_data = between(source, f'{state}
', '')
cities = re.findall('(.*?)', state_data, re.IGNORECASE|re.MULTILINE)
citiess += len(cities)
for city in cities:
main_data[country[0].lower()][state.lower()][city[1]] = city[0].split('/?')[0]
new_source = get_source(city[0].split('/?')[0])
new_source = new_source.replace('\n', '').replace('\r','')
categories = re.findall('data-alltitle="all (.*?)" data-cat="(.*?)">', new_source, re.IGNORECASE|re.MULTILINE)
for category in categories:
db['category'][category[0]] = db['category'][category[0]]+1 if category[0] in db['category'] else 1
if category[0] != 'resumes':
cat = category[0].replace(' ','-')
category_data = between(new_source, f'