# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
from time import sleep
import os
import requests

# Browser-like User-Agent header value used for the HTTP requests below.
UA = (
    "Mozilla/5.0 (Windows NT 6.3; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.118 Safari/537.36"
)
# Google search URL restricted to the target site; the result offset is
# appended after the trailing "start=" parameter.
url_template = (
    "https://www.google.co.jp/search"
    "?q=site:mint.s150.xrea.com&filter=0&start="
)

# Number of result pages to request (the offset advances by 10 per page).
search_result_max = 20
# Collected rows, each of the form [original_url, title, cache_url, file_path].
entries = []
def retrieve_entry(search_result):
    """Extract one row of data from a single Google search result.

    Args:
        search_result: PyQuery-style object wrapping one result element
            (the script passes each ``div.rc`` match).

    Returns:
        A list ``[original_url, title, cache_url, file_path]``. Every field
        is a string: ``cache_url`` is ``""`` when no cache link is found and
        ``file_path`` is always ``""`` here (the script fills it in after
        the cached page has been saved).
    """
    link = search_result("h3.r a")
    # attr() yields None when the link is missing; normalise to "" so the
    # row can always be joined and written as text later on.
    original_url = link.attr("href") or ""
    title = link.text() or ""
    cache_url = ""
    for a in search_result.items("a.fl"):
        # Several "a.fl" links may exist; the one whose label contains
        # "キャッシュ" ("cache") is taken as the cached-copy link.
        if "キャッシュ" in a.text():
            cache_url = a.attr("href") or ""
    return [original_url, title, cache_url, ""]

# Collect the URLs that turn up in the Google search results.
# Each page holds 10 results, so the "start" offset advances in steps of 10.
for offset in range(0, search_result_max * 10, 10):
    page = pq(url_template + str(offset), headers={'user-agent': UA})
    entries.extend(retrieve_entry(hit) for hit in page.items("div.rc"))
    # Pause between result pages to keep the request rate low.
    sleep(1)

# Download the cached copy of every entry that has a cache URL.
cache_dir = "cache/"
# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs(cache_dir, exist_ok=True)

i = 0  # number of cache pages saved so far; doubles as the file name
for e in entries:
    if not e[2]:
        continue  # this result has no cache link, nothing to fetch
    res = requests.get(e[2], headers={'user-agent': UA})
    # Throttle only after a request has actually been sent.
    sleep(0.5)
    if not res.ok:
        # Skip error responses so that an error page is not stored and
        # later listed in index.html as if it were the cached content.
        continue
    i += 1
    file_path = cache_dir + str(i) + ".html"
    with open(file_path, "wt", encoding="utf8") as f:
        f.write(res.text)
    # Record where the copy was saved; later steps use this field.
    e[3] = file_path

# Build index.html: a list of links to the saved cache files.
d = pq("""
    <!doctype html>
    <html>
        <head>
            <meta charset="utf-8" />
        </head>
        <body>
            <ul id="list"></ul>
        </body>
    </html>
    """)
listing = d("#list")
for e in entries:
    if e[3] != "":
        # Set the href and the label through the DOM API instead of
        # concatenating markup, so titles containing "<", "&" or quotes
        # are escaped rather than being parsed as HTML.
        item = pq('<li><a></a></li>')
        item("a").attr("href", e[3]).text(e[1])
        item.appendTo(listing)
with open("index.html", "wt", encoding="utf8") as f:
    f.write(d.outer_html())

# Reference TSV summarising every collected entry.
# Tabs and line breaks inside a value would split the row, so they are
# flattened to spaces before writing.
tsv_unsafe = str.maketrans("\t\r\n", "   ")
with open("summary.tsv", "wt", encoding="utf8") as f:
    f.write("original_url\ttitle\tcache_url\tfile_path\n")
    for e in entries:
        # A missing value (None) becomes an empty cell instead of making
        # str.join raise TypeError.
        cells = ["" if v is None else v.translate(tsv_unsafe) for v in e]
        f.write("\t".join(cells) + "\n")
