from bs4 import BeautifulSoup

import requests

import datetime

import csv

import os

import pymysql

import re

 

# --- module-level setup: timing, output directory, DB connection ---

# Wall-clock start, printed again at the end of main() for a runtime report.
start_time = datetime.datetime.now()

print('start:'+ str(start_time))

 

# format(date, '%Y%m%d') delegates to date.strftime, e.g. '20240101'.
today = format(datetime.date.today(),'%Y%m%d')

# Dated working directory; exist_ok makes reruns on the same day safe.
# NOTE(review): outcsv() writes '<today>.csv' in the CWD, not inside this
# directory — confirm whether the directory is used elsewhere.
os.makedirs(today, exist_ok = True)

# Shared connection used by main() and then closed by insert_data_bulk().
conn = pymysql.connect(

    host='localhost',

    user='root',

    password='',

    database='kabuka',

)

 

def insert_data_bulk(values):
    """Replace the entire contents of the kigyosearch table with *values*.

    Truncates the table, bulk-inserts all rows, commits, and finally
    closes the module-level connection — so this must be the last DB
    operation of the run.

    values: list of 22-item sequences matching kigyosearch's columns.
    """
    insert_sql = "INSERT INTO kigyosearch values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    try:
        # pymysql cursors are context managers: the cursor is closed even
        # if TRUNCATE or the bulk insert raises (the original leaked it).
        with conn.cursor() as cur:
            cur.execute("TRUNCATE TABLE kigyosearch")
            cur.executemany(insert_sql, values)
        conn.commit()
    finally:
        # Always release the connection, mirroring the original's
        # one-shot lifecycle but also covering the failure path.
        conn.close()

 

def outcsv(code, values):
    """Dump all scraped rows to '<today>.csv' and echo the last code.

    code:   last processed stock code (printed as a progress marker).
    values: list of row sequences to write.
    """
    # newline='' is required by the csv module (otherwise blank lines
    # appear between rows on Windows); utf-8 keeps the Japanese fields
    # portable instead of depending on the platform default encoding.
    with open(today + '.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerows(values)
    print(code)

    

def trim(txt):
    """Return *txt* with every newline and tab character removed."""
    for unwanted in ('\n', '\t'):
        txt = txt.replace(unwanted, '')
    return txt

 

def main():
    """Scrape a profile page for every stock code in `meigara`.

    For each distinct code, fetches its page on 上場企業サーチ, extracts
    ~20 fields from the <dd> cells of the company-body section, then
    writes everything to a dated CSV and bulk-loads the kigyosearch
    table. Per-code failures are logged and skipped (best effort).
    """
    u = 'https://上場企業サーチ.com/companies/'
    headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15'}
    # (The original fetched the index page here and immediately discarded
    # the result inside the loop — that dead request is removed.)

    cur = conn.cursor()
    cur.execute('SELECT DISTINCT code FROM meigara')

    values = []
    for code, in cur:
        try:
            kurl = u + code + '/'
            # timeout so a single dead page cannot hang the whole batch
            r = requests.get(kurl, headers=headers, timeout=30)
            soup = BeautifulSoup(r.content, 'html.parser')

            elem = soup.find('div', class_='company-body row')
            all_dd = elem.find_all('dd')
            name = trim(soup.find('span', class_='current').text)
            shijo = trim(all_dd[0].find_all('td')[0].text)
            # 'YYYY年MM月DD日' -> 'YYYY-MM-DD'
            jojodate = trim(all_dd[0].find_all('td')[1].text.replace('年','-').replace('月','-').replace('日',''))
            gyoshu = trim(all_dd[1].find('a').text)
            jusho1 = trim(all_dd[4].find('a').text)
            # drop the leading '[...]' bracket prefix from the address line
            jusho2 = trim(re.sub('.*]','',all_dd[4].find('div').text))
            kessan = trim(all_dd[5].text)
            url = trim(all_dd[6].text)
            jika = trim(all_dd[7].text)
            kaikei1 = trim(all_dd[8].text)
            kaikei2 = trim(all_dd[9].text)
            kaikei3 = trim(all_dd[10].text)
            kaikei4 = trim(all_dd[11].text)
            kaikei5 = trim(all_dd[12].text)
            # Pages with consolidated disclosure ('連結開示') carry one
            # extra <dd>, shifting every following field down by one.
            if '連結開示' in kaikei4:
                jugyoin1 = trim(all_dd[14].text)
                jugyoin2 = trim(all_dd[15].text)
                jugyoin3 = trim(all_dd[16].text)
                jugyoin4 = trim(all_dd[17].text)
                kansa1 = trim(all_dd[18].text)
                kansa2 = trim(all_dd[19].text)
                kansa3 = trim(all_dd[20].text)
            else:
                jugyoin1 = trim(all_dd[13].text)
                jugyoin2 = trim(all_dd[14].text)
                jugyoin3 = trim(all_dd[15].text)
                jugyoin4 = trim(all_dd[16].text)
                kansa1 = trim(all_dd[17].text)
                kansa2 = trim(all_dd[18].text)
                kansa3 = trim(all_dd[19].text)

            values.append([code,name,shijo,jojodate,gyoshu,jusho1,jusho2,kessan,url,jika,kaikei1,kaikei2,kaikei3,kaikei4,kaikei5,jugyoin1,jugyoin2,jugyoin3,jugyoin4,kansa1,kansa2,kansa3])
            print(code)

        except Exception as e:
            # Best-effort: skip this code, but surface WHY it failed
            # (the original discarded the exception entirely).
            print(code + ' error: ' + str(e))

    cur.close()

    outcsv(code, values)
    insert_data_bulk(values)

    end_time = datetime.datetime.now()
    print('end:'+ str(end_time))
    print(end_time - start_time)

    

if __name__ == "__main__":

    main()