python - Edit* Having trouble writing a crawler that will scrape a directory of folders of websites and write that info to a .csv -
I've included os.walk, but it still won't output a .csv, and I'm not sure it's reading the HTML directory. I have a directory of offline websites and need to scrape info from them (e.g. URL, email, name, phone, etc.) and output it to a .csv. When I run it (I know it's far from being executable), it hangs on line 13 with a permission denied error.
"""Scrape a directory tree of saved HTML pages and write the results to a CSV.

Every ``*.html`` file below the input folder is parsed, each
``<div class="post">`` element found in it becomes one CSV row.
"""

import csv
import os

# Column names, in the same order as the tuples yielded by ``crawlhtmls``.
# The original header had seven columns (with "name" twice) while each row
# only carried five values, so the columns could never line up.
_HEADER = ("headline", "name", "email", "phone", "description")

# Ordered (old, new) substitutions applied to the stringified elements.
# Order matters: later pairs operate on the output of earlier ones.
_COMMA_REPLACEMENTS = ((",", " -"),)
_HEADLINE_REPLACEMENTS = _COMMA_REPLACEMENTS + (
    ("<h2 class", ""),
    ("</h2>", ""),
    ("- ", ""),
    ("at ", ""),
)
_DESCRIPTION_REPLACEMENTS = _COMMA_REPLACEMENTS + (
    ("[<p>", ""),
    ("</p>]", ""),
    ("\n", " "),
    ("[]", ""),
)


def main(folder, outputfile):
    """Crawl *folder* and write one CSV row per post to *outputfile*.

    Args:
        folder: Root directory that is searched recursively for HTML files.
        outputfile: Path of the CSV file to create (overwritten if present).

    Raises:
        OSError: If *outputfile* cannot be opened for writing, e.g. because
            the path is a directory or the file is locked by another program.
    """
    # The csv module needs a text-mode file opened with newline="" on
    # Python 3; the original "wb" mode makes writerow() raise TypeError.
    with open(outputfile, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(_HEADER)
        w.writerows(crawlhtmls(folder))


def crawlhtmls(folder):
    """Yield one row tuple per post found in the HTML files under *folder*.

    Args:
        folder: Root directory to walk recursively.

    Yields:
        ``(headline, name, email, phone, description)`` tuples of strings.
    """
    for root, _dirs, files in os.walk(folder):
        for filename in files:
            if not filename.lower().endswith(".html"):
                continue
            # os.walk returns bare file names; they must be joined with the
            # directory they were found in before they can be opened.
            yield from _rows_from_file(os.path.join(root, filename))


def _rows_from_file(path):
    """Parse the HTML file at *path* and yield one row per post element."""
    # Imported here so the third-party dependency is only required once an
    # HTML file actually has to be parsed.
    from bs4 import BeautifulSoup

    # Read bytes and let BeautifulSoup detect the document encoding.
    with open(path, "rb") as fh:
        soup = BeautifulSoup(fh.read(), "html.parser")

    for post in soup.find_all("div", attrs={"class": "post"}):
        # NOTE(review): selectors are kept exactly as in the original script.
        # find("") presumably is a placeholder and "tel" looks for a <tel>
        # tag -- confirm both against the real page markup.
        headline = post.find("h2")
        name = post.find("")
        email = post.find("address")
        phone = post.find("tel")
        description = post.find("div", attrs={"class": "entry"})

        # The original also ran a clean-up chain on an undefined ``link3``
        # whose result was never emitted; it is dropped here, so no URL
        # column is produced.
        yield (
            _clean(headline, _HEADLINE_REPLACEMENTS),
            _clean(name, _COMMA_REPLACEMENTS),
            _clean(email, _COMMA_REPLACEMENTS),
            _clean(phone, _COMMA_REPLACEMENTS),
            _clean(description, _DESCRIPTION_REPLACEMENTS),
        )


def _clean(element, replacements):
    """Return *element* as a string with *replacements* applied in order.

    A missing element (``None``) becomes an empty string rather than the
    literal text ``"None"``.
    """
    if element is None:
        return ""
    text = str(element)
    for old, new in replacements:
        text = text.replace(old, new)
    return text


if __name__ == "__main__":
    folderpath = r"c:\projects\training\html"
    output = r"c:\projects\training\about.csv"
    main(folderpath, output)
Comments
Post a Comment