"""Scrape the Hacker News front page, rate stories with a classifier and
serve the results through a small bottle web application."""
from db import *
from learn import *
import datetime
import requests
from bs4 import BeautifulSoup
import bottle
from threading import Thread
import schedule
import time


def grab_all_data():
    """Return every stored training example as two parallel lists.

    Returns:
        A ``(Xs, Ys)`` tuple: ``Xs[i]`` is the ``name`` column and ``Ys[i]``
        the ``state`` column of the i-th ``TrainedItem`` row.
    """
    session = Session()
    try:
        rows = session.query(TrainedItem.name, TrainedItem.state).all()
    finally:
        # Rows hold plain column values, so they stay usable after closing.
        session.close()
    Xs = [name for name, _ in rows]
    Ys = [state for _, state in rows]
    return (Xs, Ys)


def import_data():
    """Load the labelled files under ``data`` into ``TrainedItem`` rows.

    ``load_files`` comes from one of the star imports (presumably
    scikit-learn's ``load_files`` re-exported by ``learn`` -- TODO confirm).
    Each entry is decoded as UTF-8 and its target is stored as a boolean.
    """
    session = Session()
    try:
        trained = load_files('data', shuffle=True)
        for raw_name, target in zip(trained.data, trained.target):
            session.add(TrainedItem(name=raw_name.decode('utf8'),
                                    state=bool(target)))
        session.commit()
    finally:
        session.close()


# Left disabled on purpose: call import_data() manually to seed the table.
# import_data()

# The classifier receives the loader callable itself, not its result.
clf = Classifier(grab_all_data)


def tryUpdateHN():
    """Run ``updateHN`` and print any exception instead of propagating it.

    This is the function registered with ``schedule``; an exception escaping
    it would propagate out of ``schedule.run_pending()`` in ``SchedThread``.
    """
    try:
        updateHN()
    except Exception as e:
        print(e)


def _parse_comment_count(parent_tr, hnid):
    """Return the comment count from the row following *parent_tr*, or 0.

    The count is read from the text of the last ``<a href="item?id=HNID">``
    link in the next sibling of the story row.
    """
    links = parent_tr.next_sibling.find_all('a', href="item?id=" + hnid)
    if not links:
        # No discussion link in that row: treat as zero comments instead of
        # raising IndexError and aborting the whole update.
        return 0
    comment_count_text = unicode(links[-1].text)
    # Strip the plural label first, then the singular one, so that both
    # "12\xa0comments" and "1\xa0comment" reduce to a bare number.
    comment_count_text = comment_count_text.replace(u"\xa0comments", "")
    comment_count_text = comment_count_text.replace(u"\xa0comment", "")
    try:
        return int(comment_count_text)
    except ValueError:
        # Not a number after stripping the label; show what was found.
        print(repr(comment_count_text))
        return 0


@bottle.route("/update")
def updateHN():
    """Fetch the HN front page and insert or refresh one ``FoundItem`` per story.

    Stories are matched on ``url``. A new story is inserted with a rating
    from ``clf.scan``; an already known story only has its ``hnid`` and
    ``comment_count`` refreshed. All changes are committed once at the end.

    Raises:
        Whatever ``requests``, BeautifulSoup or the database layer raise;
        ``tryUpdateHN`` is the wrapper that swallows them.
    """
    print("Updating HN...")
    session = Session()
    try:
        # A timeout keeps a stalled connection from blocking the caller
        # (the scheduler thread or a web request) forever.
        resp = requests.get("https://news.ycombinator.com/", timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")
        for t in soup.find_all('td', align=None, class_='title'):
            parent_tr = t.parent
            link = t.find('a', class_="storylink")
            if link is None:
                # Title cells without a story link are not stories; skip.
                continue
            url = link["href"]
            print(parent_tr)
            hnid = parent_tr["id"]
            comment_count = _parse_comment_count(parent_tr, hnid)

            check = session.query(FoundItem).filter(
                FoundItem.url == url).one_or_none()
            print("%s %s" % (url, check))
            if check is None:
                print(url)
                item = FoundItem(name=t.text,
                                 hnid=hnid,
                                 comment_count=comment_count,
                                 url=url,
                                 date=datetime.datetime.now(),
                                 rating=clf.scan(unicode(t.text)))
                session.add(item)
            else:
                check.hnid = hnid
                check.comment_count = comment_count
        session.commit()
    finally:
        # close() also discards uncommitted work if something failed above.
        session.close()


class SchedThread(Thread):
    """Background thread that polls ``schedule`` once per second, forever."""

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        while True:
            schedule.run_pending()
            time.sleep(1)


@bottle.route('/')
@bottle.view('index.tpl')
def news():
    """Build the template context for the front page.

    Query parameters:
        all:   ``"true"`` also shows items whose rating is not above 0.
        limit: ``"week"`` or ``"day"`` restricts to items found within the
               last 7 days / 1 day; any other value shows the 100 newest.

    Returns:
        ``dict(items=<query>, sortby=<int>)``. ``sortby`` is 2 for the
        week/day views and 1 otherwise; its meaning is defined by
        ``index.tpl`` (not visible here).
    """
    # The session is deliberately not closed here: ``items`` is a lazy query
    # that is only executed when the template iterates over it.
    session = Session()
    params = bottle.request.params
    sortCol = 1
    items = session.query(FoundItem)

    showUnder = params.get("all") == "true"
    if not showUnder:
        items = items.filter(FoundItem.rating > 0)

    limit = params.get("limit")
    if limit == "week":
        ago = datetime.datetime.now() - datetime.timedelta(days=7)
        items = items.filter(FoundItem.date > ago)  # past week
        sortCol = 2
    elif limit == "day":
        ago = datetime.datetime.now() - datetime.timedelta(days=1)
        items = items.filter(FoundItem.date > ago)  # past day
        sortCol = 2
    else:
        items = items.order_by(FoundItem.date.desc()).limit(100)

    return dict(items=items, sortby=sortCol)


# The ``<id>`` wildcard is required: without it bottle has no value to pass
# as the ``id`` argument and every request to this handler fails.
@bottle.route('/rate/<id>')
def rate(id):
    """Record a user rating for one item, retrain, and re-rate everything.

    Args:
        id: Value of the ``<id>`` URL segment, matched against
            ``FoundItem.id``.

    Query parameters:
        rating: ``"good"`` stores ``True``; anything else stores ``False``.

    Always finishes by redirecting to ``/``; responds with 404 when no
    ``FoundItem`` has the given id.
    """
    session = Session()
    try:
        rating = bottle.request.params.get('rating') == "good"
        item = session.query(FoundItem).filter(
            FoundItem.id == id).one_or_none()
        if item is None:
            bottle.abort(404, "No such item")
        name = item.name

        # insert or update
        session.merge(TrainedItem(name=name, state=rating))
        session.commit()
        clf.add(name, rating)

        # re-rate all items in DB
        for found in session.query(FoundItem):
            found.rating = clf.scan(found.name)
        session.commit()
    finally:
        session.close()
    bottle.redirect("/")


if __name__ == "__main__":
    schedule.every(10).minutes.do(tryUpdateHN)
    st = SchedThread()
    # NOTE: the thread is non-daemon, so it keeps the process alive after
    # bottle.run() returns. The original author left this toggle disabled.
    #st.daemon = True
    st.start()
    bottle.run(host="0.0.0.0", port=55512)