initial import

This commit is contained in:
ultra
2016-12-05 21:22:44 -05:00
commit 66cc8b6ad5
17 changed files with 581 additions and 0 deletions

68
alembic.ini Normal file
View File

@@ -0,0 +1,68 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = alembic
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# max length of characters to apply to the
# "slug" field
#truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat alembic/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = sqlite:///hnlearn.db
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

1
alembic/README Normal file
View File

@@ -0,0 +1 @@
Generic single-database configuration.

73
alembic/env.py Normal file
View File

@@ -0,0 +1,73 @@
from __future__ import with_statement
from alembic import context
from sqlalchemy import engine_from_config, pool
from logging.config import fileConfig
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This configures the loggers declared in the .ini file.
fileConfig(config.config_file_name)
# Model MetaData used for 'autogenerate' support.
# The project's models live in the top-level `db` module, so the current
# working directory is appended to the import path before importing it.
# NOTE(review): "." is resolved against the working directory, so this
# presumably expects alembic to be invoked from the project root - confirm.
import sys
sys.path.append(".")
from db import Base
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    The context is configured from the ``sqlalchemy.url`` option alone, so
    no Engine is created and no DBAPI has to be available.  Calls to
    context.execute() here emit the given string to the script output.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
    )
    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online():
    """Run migrations in 'online' mode.

    Builds an Engine from the ``sqlalchemy.``-prefixed options of the main
    ini section, opens a connection and runs the migrations over it inside
    a transaction.
    """
    ini_section = config.get_section(config.config_ini_section)
    # NullPool: no connection pooling for this one-shot engine.
    engine = engine_from_config(
        ini_section,
        prefix='sqlalchemy.',
        poolclass=pool.NullPool,
    )
    with engine.connect() as connection:
        context.configure(connection=connection,
                          target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()
# Run the migration path matching the mode reported by the Alembic context.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()

BIN
alembic/env.pyc Normal file

Binary file not shown.

24
alembic/script.py.mako Normal file
View File

@@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}

View File

@@ -0,0 +1,28 @@
"""added hnid column
Revision ID: 131d925693a4
Revises: d1f648fcd62a
Create Date: 2016-12-05 19:12:17.872699
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '131d925693a4'
# Parent revision: this migration applies on top of d1f648fcd62a.
down_revision = 'd1f648fcd62a'
branch_labels = None
depends_on = None
def upgrade():
    """Add the nullable string column ``hnid`` to ``found_items``."""
    hnid_column = sa.Column('hnid', sa.String(), nullable=True)
    op.add_column('found_items', hnid_column)
def downgrade():
    """Remove the ``hnid`` column from ``found_items`` again."""
    table_name, column_name = 'found_items', 'hnid'
    op.drop_column(table_name, column_name)

Binary file not shown.

View File

@@ -0,0 +1,34 @@
"""create initial state
Revision ID: d1f648fcd62a
Revises:
Create Date: 2016-12-05 18:37:48.117437
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'd1f648fcd62a'
# No parent: this is the first revision in the migration history.
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
    """Create the initial ``trained_items`` and ``found_items`` tables."""
    # Labelled training examples, keyed by the title text itself.
    trained_columns = [
        sa.Column('name', sa.String, primary_key=True),
        sa.Column('state', sa.Boolean),
    ]
    op.create_table('trained_items', *trained_columns)

    # Scraped stories; url is unique and indexed, date is indexed.
    found_columns = [
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('name', sa.String),
        sa.Column('url', sa.String, unique=True, index=True),
        sa.Column('date', sa.DateTime, index=True),
        sa.Column('rating', sa.Float),
    ]
    op.create_table('found_items', *found_columns)
def downgrade():
    """Drop both tables created by this revision's upgrade()."""
    for table_name in ('trained_items', 'found_items'):
        op.drop_table(table_name)

Binary file not shown.

View File

@@ -0,0 +1,25 @@
"""create initial state
Revision ID: d1f648fcd62a
Revises:
Create Date: 2016-12-05 18:37:48.117437
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# NOTE(review): this revision id is also declared by the "create initial
# state" migration that actually creates the tables; this file looks like a
# leftover earlier draft of it - confirm and remove if so.
revision = 'd1f648fcd62a'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
    """Placeholder upgrade step: applies no schema changes."""
    return None
def downgrade():
    """Undo nothing, mirroring the no-op upgrade() of this revision.

    The previous body called ``op.drop_table('')``: an empty table name
    identifies no table, and upgrade() in this file creates none, so there
    is nothing to drop.
    """
    pass

View File

@@ -0,0 +1,28 @@
"""added comment_count column
Revision ID: e07ebf603ff2
Revises: 131d925693a4
Create Date: 2016-12-05 19:49:45.783201
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'e07ebf603ff2'
# Parent revision: this migration applies on top of 131d925693a4.
down_revision = '131d925693a4'
branch_labels = None
depends_on = None
def upgrade():
    """Add the nullable integer column ``comment_count`` to ``found_items``."""
    count_column = sa.Column('comment_count', sa.Integer(), nullable=True)
    op.add_column('found_items', count_column)
def downgrade():
    """Remove the ``comment_count`` column from ``found_items`` again."""
    table_name, column_name = 'found_items', 'comment_count'
    op.drop_column(table_name, column_name)

26
db.py Normal file
View File

@@ -0,0 +1,26 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Boolean, Integer, String, DateTime, Float, create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
# Declarative base class shared by every model in this module; its metadata
# is what the table-creation call further down operates on.
Base = declarative_base()
class TrainedItem(Base):
    """A labelled training example for the title classifier."""

    __tablename__ = 'trained_items'

    # The title text; doubles as the primary key, so each title is stored once.
    name = Column(String, primary_key=True)
    # Boolean label attached to the title.
    state = Column(Boolean)
class FoundItem(Base):
    """A stored story row with its score and comment metadata."""

    __tablename__ = 'found_items'

    # Surrogate integer primary key.
    id = Column(Integer, primary_key=True)
    # Item identifier, stored as a string.
    hnid = Column(String)
    # Number of comments recorded for the item.
    comment_count = Column(Integer)
    # Title text.
    name = Column(String)
    # Link target; unique and indexed so a story can be looked up by URL.
    url = Column(String, unique=True, index=True)
    # Timestamp column, indexed for date-range filtering and ordering.
    date = Column(DateTime, index=True)
    # Floating-point score assigned to the title.
    rating = Column(Float)
# NOTE(review): the database URL is hard-coded here and repeated as
# sqlalchemy.url in alembic.ini; the two must be kept in sync by hand.
engine = create_engine('sqlite:///hnlearn.db')
# Runs at import time, for every importer of this module.
# NOTE(review): alembic/env.py also imports this module, so the tables may
# already exist before the first migration issues its create_table calls -
# confirm that a fresh database can still be migrated from scratch.
Base.metadata.create_all(engine)
session_factory = sessionmaker(bind=engine)
# Callers obtain a session with Session(); the scoped_session registry wraps
# the factory above.
Session = scoped_session(session_factory)

26
learn.py Normal file
View File

@@ -0,0 +1,26 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
class Classifier(object):
    """Linear text classifier over word n-gram counts.

    ``datagrabber`` is a zero-argument callable returning ``(Xs, Ys)``:
    the training texts and their labels.  The model is trained as soon as
    the instance is constructed.
    """

    def __init__(self, datagrabber):
        self.grabber = datagrabber
        self.reload()

    def reload(self):
        """Fetch the full training set and fit a fresh vectorizer and model."""
        texts, labels = self.grabber()
        # Word-level 1- to 3-grams as count features.
        self.vect = CountVectorizer(analyzer='word', ngram_range=(1, 3))
        self.train_vec = self.vect.fit_transform(texts)
        self.clf = SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            n_iter=500,
            random_state=42,
        )
        self.text_clf = self.clf.fit(self.train_vec, labels)

    def scan(self, name):
        """Return the decision-function value of the model for one text."""
        features = self.vect.transform([name])
        scores = self.text_clf.decision_function(features)
        return scores[0]

    def add(self, name, state):
        """Take a new labelled example into account by retraining.

        ``name`` and ``state`` are not used directly: the example is
        expected to be returned by the data grabber on the next reload.
        """
        # An incremental partial_fit would require a hashing vectorizer,
        # which would make the model non-reversible, so for now the whole
        # model is simply rebuilt.
        self.reload()

64
views/index.tpl Normal file
View File

@@ -0,0 +1,64 @@
<html>
<head>
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.css"/>
<script type="text/javascript" src="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.js"></script>
<script>
$(document).ready(function()
{
$('#table').DataTable({
"order": [[{{sortby}}, 'desc']],
"paging": false
});
});
</script>
</head>
<body>
<div class="container">
<nav class="navbar navbar-default">
<div class="container-fluid">
<div class="navbar-header">
<a class="navbar-brand" href="#">HNLearn</a>
</div>
<div class="collapse navbar-collapse">
<ul class="nav navbar-nav">
<li><a href="/">Latest &gt; 0</a></li>
<li><a href="/?all=true">Latest with &lt; 0</a></li>
<li><a href="/?limit=day&amp;all=true">Top for Past Day</a></li>
<li><a href="/?limit=week&amp;all=true">Top for Past Week</a></li>
</ul>
</div>
</div>
</nav>
<table id="table" class="table">
<thead>
<tr>
<th>Title</th>
<th>Date</th>
<th>Score</th>
<th>Up/Down</th>
<th>Comments</th>
</tr>
</thead>
<tbody>
%for x in items:
<tr>
<td>
<a href="{{x.url}}">{{x.name}}</a>
</td>
<td>{{x.date.strftime("%Y-%m-%d %H:%M")}}</td>
<td>{{"{0:.4g}".format(x.rating)}}</td>
<td><a href="/rate/{{x.id}}?rating=good">Up</a> / <a href="/rate/{{x.id}}?rating=bad">Down</a></td>
<td>
<a href="https://news.ycombinator.com/item?id={{x.hnid}}">
{{x.comment_count}} comments
</a>
</td>
</tr>
%end
</tbody>
</table>
</div>
</body>
</html>

57
views/index.tpl~ Normal file
View File

@@ -0,0 +1,57 @@
<html>
<head>
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.css"/>
<script type="text/javascript" src="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.js"></script>
<script>
$(document).ready(function()
{
$('#table').DataTable({
"order": [[{{sortby}}, 'desc']],
"paging": false
});
});
</script>
</head>
<body>
<div class="container">
<nav class="navbar navbar-default">
<div class="container-fluid">
<div class="navbar-header">
<a class="navbar-brand" href="#">HNLearn</a>
</div>
<div class="collapse navbar-collapse">
<ul class="nav navbar-nav">
<li><a href="/">Latest &gt; 0</a></li>
<li><a href="/?all=true">Latest with &lt; 0</a></li>
<li><a href="/?limit=day&amp;all=true">Top for Past Day</a></li>
<li><a href="/?limit=week&amp;all=true">Top for Past Week</a></li>
</ul>
</div>
</div>
</nav>
<table id="table" class="table">
<thead>
<tr>
<th>Title</th>
<th>Date</th>
<th>Score</th>
<th>Up/Down</th>
</tr>
</thead>
<tbody>
%for x in items:
<tr>
<td><a href="{{x.url}}">{{x.name}}</a></td>
<td>{{x.date.strftime("%Y-%m-%d %H:%M")}}</td>
<td>{{"{0:.4g}".format(x.rating)}}</td>
<td><a href="/rate/{{x.id}}?rating=good">Up</a> / <a href="/rate/{{x.id}}?rating=bad">Down</a></td>
</tr>
%end
</tbody>
</table>
</div>
</body>
</html>

127
web.py Normal file
View File

@@ -0,0 +1,127 @@
from db import *
from learn import *
import datetime
import requests
from bs4 import BeautifulSoup
import bottle
from threading import Thread
import schedule
import time
def grab_all_data():
    """Return every stored training example as ``(names, states)``.

    Both elements are lists of equal length, in the order the rows come
    back from the query.
    """
    rows = list(Session().query(TrainedItem.name, TrainedItem.state))
    names = [row[0] for row in rows]
    states = [row[1] for row in rows]
    return (names, states)
def import_data():
    """Seed the ``trained_items`` table from the on-disk ``data`` directory.

    The directory is read with scikit-learn's ``load_files``; each document
    becomes a TrainedItem whose name is the UTF-8 decoded content and whose
    state is the truthiness of its target index.  All rows are committed in
    a single transaction.
    """
    # load_files was never imported at module level, so calling this
    # function used to fail with a NameError; import it where it is used.
    from sklearn.datasets import load_files

    session = Session()
    trained = load_files('data', shuffle=True)
    for raw_text, target in zip(trained.data, trained.target):
        session.add(TrainedItem(name=raw_text.decode('utf8'),
                                state=bool(target)))
    session.commit()
# One-off bootstrap, left disabled: uncomment to seed the training table.
# import_data()
# Module-level classifier used by the update and rate routes; constructing
# it trains immediately on whatever grab_all_data() returns.
clf = Classifier(grab_all_data)
def _parse_comment_count(link_text):
    """Return the number in a comments-link label, or 0 if there is none.

    Accepts labels of the form "<n> comments" or "<n> comment", where the
    separator may be a regular or a non-breaking space.  Any other label is
    reported on stdout and counted as zero.  (The previous suffix-stripping
    only recognised the plural form, so a singular label parsed as 0.)
    """
    label = link_text.replace(u"\xa0", u" ").strip()
    number, _, unit = label.partition(u" ")
    if unit in (u"comment", u"comments"):
        try:
            return int(number)
        except ValueError:
            pass
    # Keep the original diagnostic for labels that carry no count.
    print(repr(link_text))
    return 0


@bottle.route("/update")
def updateHN():
    """Scrape the Hacker News front page and store or refresh its stories.

    For every title cell containing a ``storylink`` anchor, the story is
    looked up by URL.  Unknown stories are inserted with a rating from the
    module-level classifier; known ones only get their item id and comment
    count refreshed.  Everything is committed once at the end.
    """
    print("Updating HN...")
    session = Session()
    # A timeout keeps a stalled connection from blocking the caller forever.
    resp = requests.get("https://news.ycombinator.com/", timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")
    for t in soup.find_all('td', align=None, class_='title'):
        link = t.find('a', class_="storylink")
        if link is None:
            # Title cells without a story link are not stories; skip them.
            continue
        url = link["href"]
        parent_tr = t.parent
        print(parent_tr)
        hnid = parent_tr["id"]
        # The row after the title row holds the links back to the item
        # page; the last of them is taken to be the comments link.
        item_links = parent_tr.next_sibling.find_all('a', href="item?id=" + hnid)
        if item_links:
            comment_count = _parse_comment_count(unicode(item_links[-1].text))
        else:
            # No item links at all: nothing to count, and nothing to index.
            comment_count = 0
        check = session.query(FoundItem).filter(FoundItem.url == url).one_or_none()
        print("%s %s" % (url, check))
        if check is None:
            print(url)
            item = FoundItem(name=t.text,
                             hnid=hnid,
                             comment_count=comment_count,
                             url=url,
                             date=datetime.datetime.now(),
                             rating=clf.scan(unicode(t.text)))
            session.add(item)
        else:
            check.hnid = hnid
            check.comment_count = comment_count
    session.commit()
class SchedThread(Thread):
    """Thread that keeps running the pending ``schedule`` jobs."""

    def __init__(self):
        super(SchedThread, self).__init__()

    def run(self):
        """Check for due jobs once per second, forever."""
        while True:
            schedule.run_pending()
            time.sleep(1)
@bottle.route('/')
@bottle.view('index.tpl')
def news():
    """Render the story list.

    Query parameters:
      all=true    include stories whose rating is not above zero
      limit=week  only stories from the past 7 days, sorted by column 2
      limit=day   only stories from the past day, sorted by column 2
    Without a recognised ``limit`` the 100 most recent stories are shown,
    sorted by column 1.
    """
    params = bottle.request.params
    query = Session().query(FoundItem)
    if params.get("all") != "true":
        query = query.filter(FoundItem.rating > 0)

    # Map the requested window to its length in days; None means no window.
    window_days = {"week": 7, "day": 1}.get(params.get("limit"))
    if window_days is None:
        latest = query.order_by(FoundItem.date.desc()).limit(100)
        return dict(items=latest, sortby=1)

    cutoff = datetime.datetime.now() - datetime.timedelta(days=window_days)
    return dict(items=query.filter(FoundItem.date > cutoff), sortby=2)
@bottle.route('/rate/<id:int>')
def rate(id):
    """Record a user rating for one story, retrain, and re-score everything.

    ``id`` is the FoundItem primary key from the URL.  The ``rating`` query
    parameter counts as positive only when it equals "good".  Responds with
    404 when no story has that id, otherwise redirects to the front page.
    """
    session = Session()
    rating = bottle.request.params.get('rating') == "good"
    # one_or_none() instead of one(): an unknown id is a client error and
    # should not surface as an unhandled exception (HTTP 500).
    item = session.query(FoundItem).filter(FoundItem.id == id).one_or_none()
    if item is None:
        bottle.abort(404, "No such item: %d" % id)
    title = item.name
    # insert or update the training example for this title
    session.merge(TrainedItem(name=title, state=rating))
    session.commit()
    clf.add(title, rating)
    # re-rate all items in DB with the retrained model
    for found in session.query(FoundItem):
        found.rating = clf.scan(found.name)
    session.commit()
    bottle.redirect("/")
if __name__ == "__main__":
    # Background worker that executes the scheduled jobs.
    st = SchedThread()
    #st.daemon = True
    # Refresh the stored stories every ten minutes.
    schedule.every(10).minutes.do(updateHN)
    st.start()
    # Serve the web UI on all interfaces.
    bottle.run(host="0.0.0.0", port=55512)