본문 바로가기
Python/퀀트

크롤링 실습: 국내 주식 티커 데이터 크롤링

by 훈영 2024. 11. 5.
# 최근 영업일 크롤링
import requests as rq
from bs4 import BeautifulSoup

url = 'https://finance.naver.com/sise/sise_deposit.naver'
data = rq.get(url)
data_html = BeautifulSoup(data.content)

parse_day = data_html.select_one(
    'div.subtop_sise_graph2 > ul.subtop_chart_note > li > span.tah').text
parse_day


# 날짜 추출
import re

biz_day = re.findall('[0-9]+', parse_day)
biz_day = ''.join(biz_day)
biz_day


import requests as rq
from io import BytesIO
import pandas as pd


# 코스피
gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
gen_otp_stk = {
    'mktId': 'STK',  # 코스피: 'STK' / 코스피: 'KSQ'
    'trdDd': biz_day,
    'money': '1',
    'csvxls_isNo': 'false',
    'name': 'fileDown',
    'url': 'dbms/MDC/STAT/standard/MDCSTAT03901'
    }
headers = {'Referer': 'http://data.krx.co.kr/contents/MDC/MDI/mdiLoader',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

otp_stk = rq.post(gen_otp_url, gen_otp_stk, headers=headers).text


down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
down_sector_stk = rq.post(down_url, {'code': otp_stk}, headers=headers)

sector_stk = pd.read_csv(BytesIO(down_sector_stk.content), encoding='EUC-KR')
sector_stk


# 코스닥
gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
gen_otp_ksq = {
    'mktId': 'KSQ',  # 코스피: 'STK' / 코스피: 'KSQ'
    'trdDd': biz_day,
    'money': '1',
    'csvxls_isNo': 'false',
    'name': 'fileDown',
    'url': 'dbms/MDC/STAT/standard/MDCSTAT03901'
    }
headers = {'Referer': 'http://data.krx.co.kr/contents/MDC/MDI/mdiLoader',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

otp_ksq = rq.post(gen_otp_url, gen_otp_ksq, headers=headers).text


down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
down_sector_ksq = rq.post(down_url, {'code': otp_ksq}, headers=headers)

sector_ksq = pd.read_csv(BytesIO(down_sector_ksq.content), encoding='EUC-KR')
sector_ksq


# 데이터프레임 합치기
krx_sector = pd.concat([sector_stk, sector_ksq]).reset_index(drop=True)
krx_sector['종목명'] = krx_sector['종목명'].str.strip()
krx_sector['기준일'] = biz_day


# 개별종목
gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
gen_otp_data = {
    'searchType': '1',
    'mktId': 'ALL',
    'trdDd': biz_day,
    'csvxls_isNo': 'false',
    'name': 'fileDown',
    'url': 'dbms/MDC/STAT/standard/MDCSTAT03501'
    }
headers = {'Referer': 'http://data.krx.co.kr/contents/MDC/MDI/mdiLoader',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

otp = rq.post(gen_otp_url, gen_otp_data, headers=headers).text

down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
krx_ind = rq.post(down_url, {'code': otp}, headers=headers)

krx_ind = pd.read_csv(BytesIO(krx_ind.content), encoding='EUC-KR')
krx_ind['종목명'] = krx_ind['종목명'].str.strip()
krx_ind['기준일'] = biz_day


# 데이터 합치기
set(krx_sector['종목명']).symmetric_difference(set(krx_ind['종목명']))

kor_ticker = pd.merge(krx_sector,
                      krx_ind,
                      on=krx_sector.columns.intersection(
                          krx_ind.columns).tolist(),
                      how='outer')


# 스팩 종목
kor_ticker[kor_ticker['종목명'].str.contains('스팩|제[0-9]+호')]['종목명']

# 우선주
kor_ticker[kor_ticker['종목코드'].str[-1:] != '0']['종목명']

# 리츠
kor_ticker[kor_ticker['종목명'].str.endswith('리츠')]['종목명']


# 종목 구분
import numpy as np

diff = list(set(krx_sector['종목명']).symmetric_difference(set(krx_ind['종목명'])))

kor_ticker['종목구분'] = np.where(kor_ticker['종목명'].str.contains('스팩|제[0-9]+호'), '스팩',
                              np.where(kor_ticker['종목코드'].str[-1:] != '0', '우선주',
                                       np.where(kor_ticker['종목명'].str.endswith('리츠'), '리츠',
                                                np.where(kor_ticker['종목명'].isin(diff), '기타',
                                                         '보통주'))))

kor_ticker = kor_ticker.reset_index(drop=True)
kor_ticker.columns = kor_ticker.columns.str.replace(' ', '')
kor_ticker = kor_ticker[['종목코드', '종목명', '시장구분', '종가', '시가총액', '기준일', 'EPS', '선행EPS', 'BPS', '주당배당금', '종목구분']]
kor_ticker = kor_ticker.replace({np.nan: None})



# sql DB에 저장
import pymysql

con = pymysql.connect(user='root',
                      passwd='0000',
                      host='127.0.0.1',
                      db='stock_db',
                      charset='utf8')

mycursor = con.cursor()
query = f"""
    insert into kor_ticker (종목코드, 종목명, 시장구분, 종가, 시가총액, 기준일, EPS, 선행EPS, BPS, 주당배당금, 종목구분)
    values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) as new
    on duplicate key update
    종목명=new.종목명, 시장구분=new.시장구분, 종가=new.종가, 시가총액=new.시가총액, EPS=new.EPS, 선행EPS=new.선행EPS,
    BPS=new.BPS, 주당배당금=new.주당배당금, 종목구분=new.종목구분;
    """
    
args = kor_ticker.values.tolist()
mycursor.executemany(query, args)
con.commit()

con.close()

댓글