Crawling Headline News

Check the Website Info

  • Open the site in your browser, launch Developer Tools, and go to the Network tab.

  • Press Ctrl + R to reload the page, then select the Doc filter.

  • Click the request for the site and inspect its Headers tab.

    • Copy the values of referer and user-agent (a minimal usage sketch follows this list).
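
  A minimal sketch of how the copied values are used, assuming the requests library is installed; the header values shown here are placeholders to be replaced with the ones you copied:

    import requests

    # Placeholder values -- substitute the referer and user-agent
    # copied from your own browser's Developer Tools.
    CUSTOM_HEADER = {
        'referer': 'https://www.naver.com/',
        'user-agent': 'Mozilla/5.0 (copied from DevTools)'
    }

    resp = requests.get('https://www.naver.com/', headers=CUSTOM_HEADER)
    print(resp.status_code)  # 200 means the server accepted the request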

Crawling Code

  • step01_headlinenews.py

    import warnings

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    # Suppress warnings, e.g. BeautifulSoup noting that from_encoding
    # is ignored when it is given an already-decoded string.
    warnings.filterwarnings('ignore')

    def crawling(soup):
        # The headline block on the Naver front page is the
        # <div class="list_issue"> element (this may change if
        # Naver updates its markup).
        div = soup.find("div", class_="list_issue")
        print(type(div))

        # Collect the text and target URL of every link in the block.
        titles = []
        urls = []
        for a in div.find_all("a"):
            titles.append(a.get_text())
            urls.append(a['href'])
        results = (titles, urls)
        return results

    def df_csv(tp):
        # Build a DataFrame from the (titles, urls) tuple and save it.
        df = pd.DataFrame({"newstitle": tp[0], "url": tp[1]})
        print(df)

        df.to_csv("headlinecrawling.csv", index=False)
        print("Crawling is done!")

    def main():
        # The referer and user-agent copied from Developer Tools.
        CUSTOM_HEADER = {
            'referer': 'https://www.naver.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
        }

        url = 'https://www.naver.com/'
        req = requests.get(url=url, headers=CUSTOM_HEADER)
        print(req.status_code)

        # 200 : OK
        # 404 : Not Found (bad URL)
        # 503 : Service Unavailable (server down)

        soup = BeautifulSoup(req.text, 'html.parser', from_encoding='utf-8')
        df_csv(crawling(soup))

    if __name__ == "__main__":
        main()
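
  After running step01_headlinenews.py, one quick sanity check (a sketch, assuming the script was run in the current directory) is to read the CSV back with pandas:

    import pandas as pd

    # headlinecrawling.csv is the file the script writes; the column
    # names match the DataFrame built in df_csv().
    df = pd.read_csv("headlinecrawling.csv")
    print(df.head())
    print(len(df), "headlines saved")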