# 128 lines, 4.4 KiB, Python
from db import *
|
|
from learn import *
|
|
import datetime
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
import bottle
|
|
from threading import Thread
|
|
import schedule
|
|
import time
|
|
|
|
def grab_all_data():
    """Load the stored training set from the database.

    Queries the ``name`` and ``state`` columns of every TrainedItem row.

    Returns:
        A ``(names, states)`` tuple of two parallel lists, in the order the
        query yields the rows. Both lists are empty when there are no rows.
    """
    session = Session()
    rows = session.query(TrainedItem.name, TrainedItem.state).all()
    names = [row[0] for row in rows]
    states = [row[1] for row in rows]
    return (names, states)


def import_data():
    """Seed the database with the labelled samples found under ``data``.

    Reads the samples with ``load_files('data', shuffle=True)``, stores each
    one as a TrainedItem (text decoded as UTF-8, target coerced to bool) and
    commits them all in one transaction.
    """
    session = Session()
    trained = load_files('data', shuffle=True)

    for index, raw_text in enumerate(trained.data):
        session.add(TrainedItem(name=raw_text.decode('utf8'),
                                state=bool(trained.target[index])))

    session.commit()


# One-off bootstrap, left disabled: enable the call below to load the samples
# under ``data`` into the database before the classifier is built.
# import_data()

# Module-level classifier used by updateHN() and rate(). It is handed the
# grab_all_data function itself, not its result.
# NOTE(review): presumably Classifier calls it to fetch its training data --
# confirm in the learn module.
clf = Classifier(grab_all_data)


@bottle.route("/update")
def updateHN():
    """Scrape the Hacker News front page and sync it into the database.

    Every title cell that contains a story link is matched against FoundItem
    by URL. Unknown stories are inserted with a rating from the classifier;
    known ones only get their HN id and comment count refreshed. All changes
    are committed together once the whole page has been processed, and the
    session is closed whether or not that succeeds.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the request times out.
    """
    print("Updating HN...")
    session = Session()
    try:
        # Without a timeout a stalled connection would block this call (and
        # whichever thread runs it) indefinitely.
        resp = requests.get("https://news.ycombinator.com/", timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")

        for t in soup.find_all('td', align=None, class_='title'):
            link = t.find('a', class_="storylink")
            if link is None:
                # Title cells without a story link carry nothing to store.
                continue

            url = link["href"]
            parent_tr = t.parent
            print(parent_tr)
            hnid = parent_tr["id"]

            # The row after the title row holds the links back to the item;
            # the last one is taken to be the comments link.
            item_links = parent_tr.next_sibling.find_all(
                'a', href="item?id=" + hnid)
            comment_text = unicode(item_links[-1].text) if item_links else u""

            # The count looks like u"<n>\xa0comments". Splitting on the
            # non-breaking space also accepts the singular u"1\xa0comment",
            # which stripping only u"\xa0comments" turned into a count of 0.
            count_part, _, label = comment_text.partition(u"\xa0")
            if label.startswith(u"comment") and count_part.isdigit():
                comment_count = int(count_part)
            else:
                # Link text that is not a count: show it and fall back to 0.
                print(repr(comment_text))
                comment_count = 0

            check = session.query(FoundItem).filter(
                FoundItem.url == url).one_or_none()
            print("%s %s" % (url, check))

            if check is None:
                print(url)
                item = FoundItem(name=t.text,
                                 hnid=hnid,
                                 comment_count=comment_count,
                                 url=url,
                                 date=datetime.datetime.now(),
                                 rating=clf.scan(unicode(t.text)))
                session.add(item)
            else:
                check.hnid = hnid
                check.comment_count = comment_count

        session.commit()
    finally:
        # Release the connection even when fetching or parsing failed;
        # anything not yet committed is discarded.
        session.close()


class SchedThread(Thread):
    """Background thread that keeps running the jobs registered with ``schedule``."""

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        """Poll the scheduler once a second, forever.

        An exception escaping ``schedule.run_pending()`` (for example from a
        job that failed) is printed with its traceback and the loop carries
        on. Otherwise a single failure would end this thread and no job
        would ever run again.
        """
        import traceback  # local import: only needed on the error path

        while True:
            try:
                schedule.run_pending()
            except Exception:
                traceback.print_exc()
            time.sleep(1)


@bottle.route('/')
@bottle.view('index.tpl')
def news():
    """Select the stored items to display and hand them to ``index.tpl``.

    Query parameters:
        all:   unless this is ``"true"``, only items with ``rating > 0`` are
               included.
        limit: ``"week"`` or ``"day"`` keeps items dated within the past 7
               days or 1 day. Any other value (or none) selects the 100
               most recent items, newest first.

    Returns:
        A dict for the template with ``items`` (the FoundItem query) and
        ``sortby`` (2 when a week/day window is applied, otherwise 1).
    """
    session = Session()
    query = session.query(FoundItem)

    if bottle.request.params.get("all") != "true":
        query = query.filter(FoundItem.rating > 0)

    window_days = {"week": 7, "day": 1}.get(bottle.request.params.get("limit"))
    if window_days is None:
        sort_column = 1
        query = query.order_by(FoundItem.date.desc()).limit(100)
    else:
        sort_column = 2
        cutoff = datetime.datetime.now() - datetime.timedelta(days=window_days)
        query = query.filter(FoundItem.date > cutoff)

    return dict(items=query, sortby=sort_column)


@bottle.route('/rate/<id:int>')
def rate(id):
    """Record a rating for one item, teach the classifier, and re-score all items.

    The ``rating`` query parameter counts as good only when it equals
    ``"good"``; any other value, or none, is stored as not good.

    Args:
        id: FoundItem id taken from the URL. The name is fixed by the
            ``<id:int>`` route wildcard, so it cannot be renamed here.

    Raises:
        bottle.HTTPError: 404 when no FoundItem has this id.
        bottle.HTTPResponse: the redirect to ``/`` issued on success.
    """
    session = Session()
    rating = bottle.request.params.get('rating') == "good"

    rated = session.query(FoundItem).filter(FoundItem.id == id).one_or_none()
    if rated is None:
        # An unknown id is a client error, not a server failure.
        bottle.abort(404, "No such item.")
    name = rated.name

    # merge() inserts the training sample or updates the existing one.
    # NOTE(review): this relies on TrainedItem's primary key identifying the
    # sample -- confirm in the db module.
    session.merge(TrainedItem(name=name, state=rating))
    session.commit()
    clf.add(name, rating)

    # Re-rate every stored item now that the classifier has changed.
    for found in session.query(FoundItem):
        found.rating = clf.scan(found.name)
    session.commit()

    bottle.redirect("/")


if __name__ == "__main__":
    # Register the scraper to run every 10 minutes; the jobs are executed by
    # the background thread started below.
    schedule.every(10).minutes.do(updateHN)

    scheduler_thread = SchedThread()
    # The daemon flag is deliberately left disabled, as in the original.
    #scheduler_thread.daemon = True
    scheduler_thread.start()

    # Serve the web UI on all interfaces, port 55512.
    bottle.run(host="0.0.0.0", port=55512)