2018年4月24日火曜日

[Python] Get the titles and post dates of Blogger posts

This script works only with Blogger (blogspot) blogs, like this one. It uses Selenium and Python.
 
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --- Settings ---------------------------------------------------------------
# Address of the blog's top page. Blogger only (blogspot.jp or similar).
target_url = ''
# How many posts to visit, starting from the newest one.
num_get = 3
# Collected [date, title] pairs, in the order the posts are visited.
result = []

# Stop before launching a browser when the placeholder above was not filled in.
if not target_url:
    raise SystemExit('Set target_url to the address of a Blogger blog first.')

# --- Browser start-up -------------------------------------------------------
# Report why Firefox could not be started instead of exiting silently, and
# exit with a non-zero status. Catching Exception (not a bare except) lets
# Ctrl-C propagate normally.
try:
    b = webdriver.Firefox()
except Exception as err:
    raise SystemExit('Could not start Firefox through Selenium: {}'.format(err))
# Short pause after start-up, kept from the original script.
time.sleep(3)
# Open the blog's top page and wait (up to 30 s) until the first element with
# class "post-title" is clickable. until() returns that element, so it is
# used directly instead of being looked up a second time.
b.get(target_url)
first_title = WebDriverWait(b, 30).until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'post-title')))
newest_title = first_title.text

# Locate the link whose text contains the newest title and follow its href.
# find_elements(By..., ...) is used because the find_elements_by_* helpers
# no longer exist in Selenium 4.
matching_links = b.find_elements(By.PARTIAL_LINK_TEXT, newest_title)
if not matching_links:
    # Do not leave the browser open when the page is not laid out as expected.
    b.quit()
    raise SystemExit(
        'No link found for the newest post title: {!r}'.format(newest_title))
newest_postlink = matching_links[0].get_attribute('href')
b.get(newest_postlink)
# Starting from the currently opened post, record the date and title of up to
# num_get posts, moving to the next older post after each one.
# The finally clause ends the browser session even if a wait times out or an
# expected element is missing.
try:
    for cnt in range(num_get):
        # Wait (up to 30 s) for the date header; until() returns the element.
        date_element = WebDriverWait(b, 30).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'date-header')))
        date = date_element.text

        # find_elements(By..., ...) is used because the find_elements_by_*
        # helpers no longer exist in Selenium 4.
        title = b.find_elements(By.CLASS_NAME, 'post-title')[0].text

        print(date, title)
        result.append([date, title])

        # Nothing more to read after the last requested post, so skip the
        # extra page load.
        if cnt == num_get - 1:
            break

        older_links = b.find_elements(By.CLASS_NAME, 'blog-pager-older-link')
        if not older_links:
            # No "older" link on this page: stop with what has been collected
            # rather than failing with an IndexError.
            break
        older_links[0].click()
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    b.quit()
If you want to save the result as a CSV file:
import pandas as pd
# Save the collected [date, title] rows to date_title.csv.
# header=None and index=None are passed through unchanged; they keep the
# column-header row and the row-index column out of the file.
pd.DataFrame(data=result).to_csv(
    path_or_buf='date_title.csv',
    header=None,
    index=None,
)

0 件のコメント:

コメントを投稿