Files
hnlearn/web.py
2016-12-05 21:22:44 -05:00

128 lines
4.4 KiB
Python

from db import *
from learn import *
import datetime
import requests
from bs4 import BeautifulSoup
import bottle
from threading import Thread
import schedule
import time
def grab_all_data():
    """Fetch every stored training example as two parallel lists.

    Opens a new ``Session`` and reads the ``name`` and ``state`` columns of
    all ``TrainedItem`` rows.

    Returns:
        A ``(names, states)`` tuple of two equally long lists in which
        ``states[i]`` is the state stored alongside ``names[i]``. Both lists
        are empty when the table has no rows.
    """
    db = Session()
    pairs = [
        (name, state)
        for name, state in db.query(TrainedItem.name, TrainedItem.state)
    ]
    names = [name for name, _ in pairs]
    states = [state for _, state in pairs]
    return (names, states)
def import_data():
    """Seed the ``TrainedItem`` table from the labelled files under ``data``.

    ``load_files`` is not defined in this file; it arrives through one of the
    star imports (presumably scikit-learn's loader -- confirm). Each loaded
    document is decoded as UTF-8 and stored with its target coerced to a
    bool. All rows are added to one session and committed together.
    """
    session = Session()
    corpus = load_files('data', shuffle=True)
    # Walk documents and labels side by side instead of indexing both.
    for raw_text, label in zip(corpus.data, corpus.target):
        session.add(TrainedItem(name=raw_text.decode('utf8'), state=bool(label)))
    session.commit()
# One-off seeding step, normally left disabled: uncomment the call below to
# insert the labelled files under 'data' as TrainedItem rows.
# import_data()
# Module-wide classifier shared by the update job and the request handlers.
# NOTE(review): the loader function itself is passed, not its result --
# presumably Classifier calls it when it needs the data; confirm in learn.py.
clf = Classifier(grab_all_data)
def _parse_comment_count(link_text):
    """Extract the comment count from the text of a story's last item link.

    Args:
        link_text: Unicode text of the link, i.e. a number, a non-breaking
            space and the word "comment" or "comments".

    Returns:
        The count as an int, or 0 when the text is not a comment count
        (for instance "discuss", or an age link such as "2 hours ago").
    """
    number, _, noun = link_text.partition(u"\xa0")
    # Accept the singular form too, so a story with one comment is counted.
    if noun in (u"comment", u"comments"):
        try:
            return int(number)
        except ValueError:
            pass
    # Debugging aid: show whatever could not be read as a count.
    print(repr(link_text))
    return 0


@bottle.route("/update")
def updateHN():
    """Scrape the Hacker News front page and sync it into ``FoundItem``.

    Runs both as the ``/update`` route and as the periodic scheduler job.
    For every story row on the page:

    * a URL not yet stored is inserted as a new ``FoundItem`` with the
      current time and a rating from ``clf.scan``;
    * a URL already stored has its ``hnid`` and ``comment_count`` refreshed.

    All changes are committed once, after the whole page is processed.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the request times out.
    """
    print("Updating HN...")
    session = Session()
    # A timeout keeps a stalled connection from blocking the scheduler
    # thread forever.
    resp = requests.get("https://news.ycombinator.com/", timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")
    for t in soup.find_all('td', align=None, class_='title'):
        link = t.find('a', class_="storylink")
        if link is None:
            # Not a story cell (e.g. the "More" pagination row, whose <tr>
            # carries no id attribute).
            continue
        parent_tr = t.parent
        url = link["href"]
        print(parent_tr)
        hnid = parent_tr["id"]
        # The row after the title row holds the links to the item page; the
        # last of them is the comments link when the story has one.
        item_links = parent_tr.next_sibling.find_all('a', href="item?id=" + hnid)
        if item_links:
            comment_count = _parse_comment_count(unicode(item_links[-1].text))
        else:
            comment_count = 0
        check = session.query(FoundItem).filter(FoundItem.url == url).one_or_none()
        print("%s %s" % (url, check))
        if check is None:
            print(url)
            item = FoundItem(name=t.text,
                             hnid=hnid,
                             comment_count=comment_count,
                             url=url,
                             date=datetime.datetime.now(),
                             rating=clf.scan(unicode(t.text)))
            session.add(item)
        else:
            check.hnid = hnid
            check.comment_count = comment_count
    session.commit()
class SchedThread(Thread):
    """Background thread that drives the ``schedule`` job queue."""

    def __init__(self):
        super(SchedThread, self).__init__()

    def run(self):
        """Poll for due jobs once per second; never returns."""
        while True:
            schedule.run_pending()
            time.sleep(1)
@bottle.route('/')
@bottle.view('index.tpl')
def news():
    """Build the front-page listing from the stored ``FoundItem`` rows.

    Query parameters:
        all:   when "true", items with a rating of 0 or less are included;
               otherwise only items rated above 0 are shown.
        limit: "week" or "day" restricts the listing to items found within
               the last 7 days or 1 day; any other value (or none) shows the
               100 most recently found items, newest first.

    Returns:
        A dict for the ``index.tpl`` template: ``items`` is the (unexecuted)
        query and ``sortby`` is 2 for the week/day views, 1 otherwise. How
        the template interprets ``sortby`` is not visible here.
    """
    db = Session()
    params = bottle.request.params
    query = db.query(FoundItem)

    if params.get("all") != "true":
        query = query.filter(FoundItem.rating > 0)

    # Map the recognised "limit" values onto the size of the time window.
    window_days = {"week": 7, "day": 1}.get(params.get("limit"))
    if window_days is None:
        sort_column = 1
        query = query.order_by(FoundItem.date.desc()).limit(100)
    else:
        sort_column = 2
        cutoff = datetime.datetime.now() - datetime.timedelta(days=window_days)
        query = query.filter(FoundItem.date > cutoff)

    return dict(items=query, sortby=sort_column)
@bottle.route('/rate/<id:int>')
def rate(id):
    """Record a user's verdict on one item and re-rate everything.

    The item's name is stored as a training example (``rating=good`` in the
    query string means a positive example, anything else a negative one),
    the classifier is updated with it, every stored ``FoundItem`` gets a
    fresh rating, and the client is redirected to the front page.

    Args:
        id: Primary key of the ``FoundItem`` being rated. The name is fixed
            by the ``<id:int>`` route wildcard.

    Raises:
        bottle.HTTPError: 404 when no item with this id exists.
        bottle.HTTPResponse: the redirect to "/" on success (this is how
            bottle implements ``redirect``).
    """
    session = Session()
    rating = bottle.request.params.get('rating') == "good"
    rated = session.query(FoundItem).filter(FoundItem.id == id).one_or_none()
    if rated is None:
        # An unknown id is a client error, not a server crash.
        bottle.abort(404, "No such item.")
    # insert or update
    session.merge(TrainedItem(name=rated.name, state=rating))
    session.commit()
    clf.add(rated.name, rating)
    # re-rate all items in DB
    for found in session.query(FoundItem):
        found.rating = clf.scan(found.name)
    session.commit()
    bottle.redirect("/")
if __name__ == "__main__":
    # Refresh the stored front page every ten minutes.
    schedule.every(10).minutes.do(updateHN)
    # The polling thread is deliberately left non-daemon (the daemon flag
    # was tried and disabled), so it keeps the process alive on its own.
    poller = SchedThread()
    poller.start()
    # Serve on every network interface.
    bottle.run(port=55512, host="0.0.0.0")