#!/usr/bin/env python # CraigsList Parser - Developed by acidvegas in Python (https://acid.vegas/random) ''' Random script to parse all the countries, states, cities, & sections/sub-sections on CraigsList ''' import re, time, urllib.request def between(source, start, stop): data = re.compile(start + '(.*?)' + stop, re.IGNORECASE|re.MULTILINE).search(source) return data.group(1) if data else False def get_source(url): source = urllib.request.urlopen(url, timeout=10) charset = source.headers.get_content_charset() return source.read().decode(charset) if charset else source.read().decode() db = {'category':dict(),'subcat':dict()} source = get_source('http://www.craigslist.org/about/sites?lang=en&cc=us') countries = re.findall('

(.*?)

', source, re.IGNORECASE|re.MULTILINE) source = source.replace('\n', '').replace('\r','') main_data = dict() statess = 0 citiess = 0 for country in countries: main_data[country[0].lower()] = dict() data = between(source, '

{1}

'.format(country[0], country[1]),' ') states = re.findall('

(.*?)

', data, re.IGNORECASE|re.MULTILINE) statess += len(states) for state in states: main_data[country[0].lower()][state.lower()] = dict() state_data = between(source, f'

{state}

', '') cities = re.findall('
  • (.*?)
  • ', state_data, re.IGNORECASE|re.MULTILINE) citiess += len(cities) for city in cities: main_data[country[0].lower()][state.lower()][city[1]] = city[0].split('/?')[0] new_source = get_source(city[0].split('/?')[0]) new_source = new_source.replace('\n', '').replace('\r','') categories = re.findall('data-alltitle="all (.*?)" data-cat="(.*?)">', new_source, re.IGNORECASE|re.MULTILINE) for category in categories: db['category'][category[0]] = db['category'][category[0]]+1 if category[0] in db['category'] else 1 if category[0] != 'resumes': cat = category[0].replace(' ','-') category_data = between(new_source, f'

    (.*?)