# hnlearn/web.py

from db import *
from learn import *
import datetime

import requests
from bs4 import BeautifulSoup

import bottle
from threading import Thread
import schedule
import time

def grab_all_data():
        """Return all stored training examples as two parallel lists.

        Returns:
                A ``(names, states)`` tuple where ``names[i]`` and ``states[i]``
                are the ``name`` and ``state`` columns of the i-th TrainedItem
                row returned by the query.
        """
        session = Session()
        rows = session.query(TrainedItem.name, TrainedItem.state).all()
        names = [row[0] for row in rows]
        states = [row[1] for row in rows]
        return (names, states)

def import_data():
        """Load the labelled files under ``data`` into the TrainedItem table.

        Each loaded document is decoded as UTF-8 and stored as the item name;
        its target label is coerced to a boolean and stored as the state.
        Everything is committed in a single transaction.
        """
        session = Session()
        corpus = load_files('data', shuffle=True)

        # Pair every raw document with its label instead of indexing both.
        session.add_all(
                TrainedItem(name=raw.decode('utf8'), state=bool(label))
                for raw, label in zip(corpus.data, corpus.target)
        )

        session.commit()

# Disabled call to import_data(), which loads the files under 'data' into the
# TrainedItem table (presumably a one-time seeding step -- confirm before
# re-enabling).
# import_data()
# Module-wide classifier shared by the request handlers below. It is handed
# the grab_all_data function itself (not its result).
clf = Classifier(grab_all_data)

@bottle.route("/update")
def updateHN():
        print "Updating HN..."
        session = Session()
        resp = requests.get("https://news.ycombinator.com/")
        soup = BeautifulSoup(resp.text, "lxml")
        for t in soup.find_all('td', align=None, class_='title'):
                parent_tr = t.parent
                url = t.find('a', class_="storylink")
                if url is not None:
                        url = url["href"]
                        print parent_tr
                        hnid = parent_tr["id"]
                        comment_count_text = unicode(parent_tr.next_sibling.find_all('a', href="item?id=" + hnid)[-1].text).replace(u"\xa0comments", "")
                        comment_count = 0
                        try:
                                comment_count = int(comment_count_text)
                        except:
                                print repr(comment_count_text)
                                pass
                        check = session.query(FoundItem).filter(FoundItem.url == url).one_or_none()
                        print url, check
                        if check is None:
                                print url
                                item = FoundItem(name=t.text,
                                                 hnid=hnid,
                                                 comment_count=comment_count,
                                         url=url,
                                         date=datetime.datetime.now(),
                                         rating=clf.scan(unicode(t.text)))
                                session.add(item)
                        else:
                                check.hnid = hnid
                                check.comment_count = comment_count
        session.commit()

class SchedThread(Thread):
        """Background thread that runs due ``schedule`` jobs once per second.

        Constructed with no arguments. ``run`` never returns.
        """

        def run(self):
                """Poll the scheduler forever, surviving failures of individual jobs."""
                # Local import keeps this class self-contained within the module.
                import traceback

                while True:
                        try:
                                schedule.run_pending()
                        except Exception:
                                # An exception from a job would otherwise end this
                                # thread, and no scheduled job would run again.
                                traceback.print_exc()
                        time.sleep(1)

@bottle.route('/')
@bottle.view('index.tpl')
def news():
        """Build the context for the ``index.tpl`` listing of found items.

        Query parameters:
                all:   when equal to "true", items with a rating of 0 or less
                       are included; otherwise they are filtered out.
                limit: "week" or "day" restricts the result to items dated
                       within the last 7 days or 1 day; any other value yields
                       the 100 most recent items.

        Returns:
                A dict with ``items`` (the query to iterate) and ``sortby``
                (2 for the week/day views, 1 otherwise).
        """
        session = Session()
        params = bottle.request.params

        found = session.query(FoundItem)
        if params.get("all") != "true":
                found = found.filter(FoundItem.rating > 0)

        # Size in days of the time window selected by the "limit" parameter.
        window_days = {"week": 7, "day": 1}.get(params.get("limit"))
        if window_days is None:
                newest = found.order_by(FoundItem.date.desc()).limit(100)
                return dict(items=newest, sortby=1)

        cutoff = datetime.datetime.now() - datetime.timedelta(days=window_days)
        return dict(items=found.filter(FoundItem.date > cutoff), sortby=2)

@bottle.route('/rate/<id:int>')
def rate(id):
        """Record a rating for one found item, retrain, and re-score everything.

        The ``rating`` query parameter counts as positive only when it equals
        "good"; any other value, or none at all, is stored as negative. The
        item's name is merged into TrainedItem with that state, fed to the
        classifier, and every FoundItem is then re-rated with ``clf.scan``.

        Args:
                id: value of the ``<id:int>`` URL segment, matched against
                        ``FoundItem.id``. The name shadows the builtin but is
                        fixed by the route wildcard.

        Responds with 404 when no item has that id, otherwise redirects to "/".
        """
        session = Session()
        rating = bottle.request.params.get('rating') == "good"
        item = session.query(FoundItem).filter(FoundItem.id == id).one_or_none()
        if item is None:
                # Unknown id: answer 404 rather than let NoResultFound become a 500.
                bottle.abort(404, "No such item.")
        # insert or update
        session.merge(TrainedItem(name=item.name, state=rating))
        session.commit()
        clf.add(item.name, rating)
        # re-rate all items in DB; a separate loop name keeps `item` intact
        for found in session.query(FoundItem):
                found.rating = clf.scan(found.name)
        session.commit()
        bottle.redirect("/")

if __name__ == "__main__":
        # Register the scrape to run every ten minutes.
        schedule.every(10).minutes.do(updateHN)

        # The daemon flag is left unset, so this never-ending thread keeps the
        # process alive even after the web server stops.
        scheduler_thread = SchedThread()
        scheduler_thread.start()

        # Serve HTTP on all interfaces.
        bottle.run(port=55512, host="0.0.0.0")