from bs4 import BeautifulSoup
import requests
import datetime
import csv
import os
import pymysql
import re
# Wall-clock start of the run; main() subtracts this from the end time.
start_time = datetime.datetime.now()
print(f'start:{start_time}')

# Date stamp (YYYYMMDD) used as the output directory name and, in outcsv(),
# as the CSV file stem.
today = datetime.date.today().strftime('%Y%m%d')
os.makedirs(today, exist_ok=True)

# Single module-wide MySQL connection shared by main() and insert_data_bulk().
conn = pymysql.connect(host='localhost', user='root', password='', database='kabuka')
def insert_data_bulk(values):
    """Replace the contents of ``kigyosearch`` with ``values``, then close ``conn``.

    The table is truncated first, all rows are inserted in one
    ``executemany`` call, and the transaction is committed.

    Args:
        values: Sequence of 22-item rows, one per ``%s`` placeholder in the
            INSERT statement.

    Raises:
        Whatever the database driver raises on failure. The cursor and the
        module-level connection are released in that case as well.
    """
    insert_sql = "INSERT INTO kigyosearch values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    try:
        # The context manager closes the cursor even if a statement fails.
        with conn.cursor() as cur:
            cur.execute("TRUNCATE TABLE kigyosearch")
            cur.executemany(insert_sql, values)
        conn.commit()
    finally:
        # This function owns the final close of the shared connection; do it
        # on the error path too so the connection is not leaked.
        conn.close()
def outcsv(code, values):
    """Write every row in ``values`` to ``<today>.csv`` and print ``code``.

    The file is opened in write mode, so any existing file with the same
    name is overwritten.

    Args:
        code: Value echoed to stdout after the file is written; it is not
            written to the CSV itself.
        values: Iterable of rows (each an iterable of fields) to write.
    """
    # newline='' is required by the csv module: without it, Windows output
    # gets a blank line after every row and embedded newlines are mangled.
    with open(today + '.csv', 'w', newline='') as f:
        csv.writer(f).writerows(values)
    print(code)
def trim(txt):
    """Return ``txt`` with every newline and tab character removed."""
    # One pass with a deletion table instead of two chained replace() calls.
    strip_table = str.maketrans('', '', '\n\t')
    return txt.translate(strip_table)
# Seconds to wait on each HTTP request before giving up on that company.
_REQUEST_TIMEOUT = 30

# Strips everything up to and including the last ']' on a line.
_UP_TO_BRACKET = re.compile('.*]')


def _parse_company(code, soup):
    """Build one output row from a parsed company page.

    Fields are read positionally from the ``<dd>`` elements inside
    ``div.company-body.row``. Returns a 22-item list beginning with ``code``.
    Raises AttributeError or IndexError when the page does not have the
    expected structure.
    """
    all_dd = soup.find('div', class_='company-body row').find_all('dd')
    name = trim(soup.find('span', class_='current').text)

    listing_cells = all_dd[0].find_all('td')
    shijo = trim(listing_cells[0].text)
    # Turn a "Y年M月D日" date into "Y-M-D".
    jojodate = trim(
        listing_cells[1].text.replace('年', '-').replace('月', '-').replace('日', '')
    )
    gyoshu = trim(all_dd[1].find('a').text)
    jusho1 = trim(all_dd[4].find('a').text)
    jusho2 = trim(_UP_TO_BRACKET.sub('', all_dd[4].find('div').text))

    # dd[5]..dd[12]: kessan, url, jika, kaikei1..kaikei5.
    fixed = [trim(all_dd[i].text) for i in range(5, 13)]
    kaikei4 = fixed[6]

    # When dd[11] mentions '連結開示' the remaining seven fields
    # (jugyoin1..4, kansa1..3) start one <dd> later.
    first = 14 if '連結開示' in kaikei4 else 13
    tail = [trim(all_dd[i].text) for i in range(first, first + 7)]

    return [code, name, shijo, jojodate, gyoshu, jusho1, jusho2, *fixed, *tail]


def main():
    """Scrape one page per code in ``meigara`` and persist the results.

    For each distinct ``code`` the company page is fetched and parsed into a
    row; a failure on one company is reported and skipped. Afterwards all
    rows are written to the CSV via ``outcsv`` and loaded into the database
    via ``insert_data_bulk``, and the end time and elapsed time are printed.
    If ``meigara`` yields no codes, neither the CSV nor the table is touched.
    """
    # NOTE(review): SOURCE had lost its indentation; this assumes the CSV/DB
    # writes and the timing output run once, after the loop — confirm.
    u = 'https://上場企業サーチ.com/companies/'
    headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15'}

    # Read all codes up front so the cursor can be closed before scraping.
    with conn.cursor() as cur:
        cur.execute('SELECT DISTINCT code FROM meigara')
        codes = [code for code, in cur]

    values = []
    for code in codes:
        try:
            # A timeout keeps one unresponsive page from stalling the run.
            r = requests.get(u + code + '/', headers=headers, timeout=_REQUEST_TIMEOUT)
            soup = BeautifulSoup(r.content, 'html.parser')
            values.append(_parse_company(code, soup))
            print(code)
        except Exception as e:
            # Per-company boundary: report why it failed and carry on.
            print(f'{code} error: {e!r}')

    if codes:
        outcsv(codes[-1], values)
        insert_data_bulk(values)

    end_time = datetime.datetime.now()
    print('end:' + str(end_time))
    print(end_time - start_time)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()