commit 66cc8b6ad50f6044407d8eb26564e4e02a35257a Author: ultra Date: Mon Dec 5 21:22:44 2016 -0500 initial import diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..d6fac53 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,68 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# max length of characters to apply to the +# "slug" field +#truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; this defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path +# version_locations = %(here)s/bar %(here)s/bat alembic/versions + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = sqlite:///hnlearn.db + + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/README b/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. 
\ No newline at end of file
diff --git a/alembic/env.py b/alembic/env.py
new file mode 100644
index 0000000..9e5a106
--- /dev/null
+++ b/alembic/env.py
@@ -0,0 +1,77 @@
+from __future__ import with_statement
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+from logging.config import fileConfig
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+# from myapp import mymodel
+# target_metadata = mymodel.Base.metadata
+# Put the current working directory on the import path so that the
+# project's db module (which defines Base) can be imported below.
+# NOTE(review): importing db also executes its module-level
+# Base.metadata.create_all(engine) -- confirm that is intended here.
+import sys
+sys.path.append(".")
+from db import Base
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline():
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well. By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url, target_metadata=target_metadata, literal_binds=True)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online():
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    connectable = engine_from_config(
+        config.get_section(config.config_ini_section),
+        prefix='sqlalchemy.',
+        poolclass=pool.NullPool)
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection,
+            target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
diff --git a/alembic/env.pyc b/alembic/env.pyc
new file mode 100644
index 0000000..3bd2c4e
Binary files /dev/null and b/alembic/env.pyc differ
diff --git a/alembic/script.py.mako b/alembic/script.py.mako
new file mode 100644
index 0000000..2c01563
--- /dev/null
+++ b/alembic/script.py.mako
@@ -0,0 +1,24 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+
+def upgrade():
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade():
+    ${downgrades if downgrades else "pass"}
diff --git a/alembic/versions/131d925693a4_added_hnid_column.py b/alembic/versions/131d925693a4_added_hnid_column.py
new file mode 100644
index 0000000..d7c3db6
--- /dev/null
+++ b/alembic/versions/131d925693a4_added_hnid_column.py
@@ -0,0 +1,32 @@
+"""added hnid column
+
+Revision ID: 131d925693a4
+Revises: d1f648fcd62a
+Create Date: 2016-12-05 19:12:17.872699
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '131d925693a4'
+down_revision = 'd1f648fcd62a'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Add the nullable string column 'hnid' to 'found_items'."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('found_items', sa.Column('hnid', sa.String(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Drop the 'hnid' column from 'found_items'."""
+    # NOTE(review): alembic.ini points at SQLite; confirm drop_column works
+    # there without Alembic's batch mode.
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('found_items', 'hnid')
+    # ### end Alembic commands ###
diff --git a/alembic/versions/131d925693a4_added_hnid_column.pyc b/alembic/versions/131d925693a4_added_hnid_column.pyc
new file mode 100644
index 0000000..056dd4b
Binary files /dev/null and b/alembic/versions/131d925693a4_added_hnid_column.pyc differ
diff --git a/alembic/versions/d1f648fcd62a_create_initial_state.py b/alembic/versions/d1f648fcd62a_create_initial_state.py
new file mode 100644
index 0000000..b39e9cb
--- /dev/null
+++ b/alembic/versions/d1f648fcd62a_create_initial_state.py
@@ -0,0 +1,36 @@
+"""create initial state
+
+Revision ID: d1f648fcd62a
+Revises: 
+Create Date: 2016-12-05 18:37:48.117437
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd1f648fcd62a'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create the 'trained_items' and 'found_items' tables."""
+    op.create_table('trained_items',
+        sa.Column('name', sa.String, primary_key=True),
+        sa.Column('state', sa.Boolean))
+    op.create_table('found_items',
+        sa.Column('id', sa.Integer, primary_key=True),
+        sa.Column('name', sa.String),
+        sa.Column('url', sa.String, unique=True, index=True),
+        sa.Column('date', sa.DateTime, index=True),
+        sa.Column('rating', sa.Float))
+    pass  # no-op; has no effect after the calls above
+
+
+def downgrade():
+    """Drop both tables created by upgrade()."""
+    op.drop_table('trained_items')
+    op.drop_table('found_items')
diff --git a/alembic/versions/d1f648fcd62a_create_initial_state.pyc b/alembic/versions/d1f648fcd62a_create_initial_state.pyc
new file mode 100644
index 0000000..288fc2f
Binary files /dev/null and b/alembic/versions/d1f648fcd62a_create_initial_state.pyc differ
diff --git a/alembic/versions/d1f648fcd62a_create_initial_state.py~ b/alembic/versions/d1f648fcd62a_create_initial_state.py~
new file mode 100644
index 0000000..ec1113c
---
/dev/null
+++ b/alembic/versions/d1f648fcd62a_create_initial_state.py~
@@ -0,0 +1,25 @@
+"""create initial state
+
+Revision ID: d1f648fcd62a
+Revises: 
+Create Date: 2016-12-05 18:37:48.117437
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd1f648fcd62a'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+
+    pass
+
+
+def downgrade():
+    op.drop_table('')
diff --git a/alembic/versions/e07ebf603ff2_added_comment_count_column.py b/alembic/versions/e07ebf603ff2_added_comment_count_column.py
new file mode 100644
index 0000000..f7952e1
--- /dev/null
+++ b/alembic/versions/e07ebf603ff2_added_comment_count_column.py
@@ -0,0 +1,32 @@
+"""added comment_count column
+
+Revision ID: e07ebf603ff2
+Revises: 131d925693a4
+Create Date: 2016-12-05 19:49:45.783201
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e07ebf603ff2'
+down_revision = '131d925693a4'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Add the nullable integer column 'comment_count' to 'found_items'."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('found_items', sa.Column('comment_count', sa.Integer(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Drop the 'comment_count' column from 'found_items'."""
+    # NOTE(review): alembic.ini points at SQLite; confirm drop_column works
+    # there without Alembic's batch mode.
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('found_items', 'comment_count')
+    # ### end Alembic commands ###
diff --git a/alembic/versions/e07ebf603ff2_added_comment_count_column.pyc b/alembic/versions/e07ebf603ff2_added_comment_count_column.pyc
new file mode 100644
index 0000000..b952437
Binary files /dev/null and b/alembic/versions/e07ebf603ff2_added_comment_count_column.pyc differ
diff --git a/db.py b/db.py
new file mode 100644
index 0000000..091ccc6
--- /dev/null
+++ b/db.py
@@ -0,0 +1,32 @@
+"""SQLAlchemy models and session setup for the hnlearn.db SQLite database."""
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Boolean, Integer, String, DateTime, Float, create_engine
+from sqlalchemy.orm import sessionmaker, scoped_session
+
+
+Base = declarative_base()
+
+class TrainedItem(Base):
+    """A labelled training example: an item name and its boolean state."""
+    __tablename__ = 'trained_items'
+    name = Column(String, primary_key=True)
+    state = Column(Boolean)
+
+class FoundItem(Base):
+    """An item found by the scraper in web.py, with its computed rating."""
+    __tablename__ = 'found_items'
+    id = Column(Integer, primary_key=True)
+    hnid = Column(String)  # web.py stores the id attribute of the story's table row
+    comment_count = Column(Integer)
+    name = Column(String)
+    url = Column(String, unique=True, index=True)
+    date = Column(DateTime, index=True)  # web.py stores the time the item was first seen
+    rating = Column(Float)  # web.py stores the Classifier.scan() score
+
+# Module-level setup, executed on import: connect to hnlearn.db and
+# create any tables from the models above that do not exist yet.
+engine = create_engine('sqlite:///hnlearn.db')
+Base.metadata.create_all(engine)
+session_factory = sessionmaker(bind=engine)
+# Session() returns the registry's session for the calling thread.
+Session = scoped_session(session_factory)
diff --git a/learn.py b/learn.py
new file mode 100644
index 0000000..4b11d96
--- /dev/null
+++ b/learn.py
@@ -0,0 +1,34 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import SGDClassifier
+
+class Classifier(object):
+    """Word n-gram text classifier that trains itself from a data callback."""
+    def __init__(self, datagrabber):
+        """Keep ``datagrabber`` (a callable returning ``(Xs, Ys)``) and train."""
+        self.grabber = datagrabber
+        self.reload()
+
+    def reload(self):
+        """Fetch the training data again and refit vectorizer and model."""
+        Xs, Ys = self.grabber()
+
+        # count word 1- to 3-grams
+        self.vect = CountVectorizer(analyzer='word',ngram_range=(1,3))
+        self.train_vec = self.vect.fit_transform(Xs)
+
+        # NOTE(review): later scikit-learn releases replaced n_iter with
+        # max_iter -- confirm against the installed version.
+        self.clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=500, random_state=42)
+        self.text_clf = self.clf.fit(self.train_vec, Ys)
+
+    def scan(self, name):
+        """Return the model's decision_function value for the text ``name``."""
+        v = self.vect.transform([name])
+        return self.text_clf.decision_function(v)[0]
+
+    def add(self, name, state):
+        """Retrain after a new label; ``name`` and ``state`` are not used yet."""
+        # implement add using partial_fit
+        # this would mean switching to hashing vectorizer, which means we can't reverse the model
+        # so for now we're just going to reload completely
+        self.reload()
diff --git a/views/index.tpl b/views/index.tpl
new file mode 100644
index 0000000..859fd6e
--- /dev/null
+++ b/views/index.tpl
@@ -0,0 +1,64 @@
+ + + + + + + + +
+ + + + + + + + + + + + +%for x in items: + + + + + + + +%end + +
TitleDateScoreUp/DownComments
+ {{x.name}} + {{x.date.strftime("%Y-%m-%d %H:%M")}}{{"{0:.4g}".format(x.rating)}}Up / Down + + {{x.comment_count}} comments + +
+
+ + diff --git a/views/index.tpl~ b/views/index.tpl~ new file mode 100644 index 0000000..2cb77e8 --- /dev/null +++ b/views/index.tpl~ @@ -0,0 +1,57 @@ + + + + + + + + +
+ + + + + + + + + + + +%for x in items: + + + + + + + +%end + +
TitleDateScoreUp/Down
{{x.name}}{{x.date.strftime("%Y-%m-%d %H:%M")}}{{"{0:.4g}".format(x.rating)}}Up / Down
+
+ + \ No newline at end of file
diff --git a/web.py b/web.py
new file mode 100644
index 0000000..bead3be
--- /dev/null
+++ b/web.py
@@ -0,0 +1,159 @@
+from db import *
+from learn import *
+import datetime
+import traceback
+
+import requests
+from bs4 import BeautifulSoup
+
+import bottle
+from threading import Thread
+import schedule
+import time
+
+def grab_all_data():
+    """Return every training example as parallel lists ``(names, states)``."""
+    session = Session()
+    Xs = []
+    Ys = []
+    for name, state in session.query(TrainedItem.name, TrainedItem.state):
+        Xs.append(name)
+        Ys.append(state)
+    return (Xs, Ys)
+
+def import_data():
+    """Load the labelled files under ``data`` into the trained_items table."""
+    # load_files is not provided by ``db`` or ``learn``; without this
+    # import the call below raised NameError.
+    from sklearn.datasets import load_files
+
+    session = Session()
+    trained = load_files('data', shuffle=True)
+
+    for text, target in zip(trained.data, trained.target):
+        item = TrainedItem(name=text.decode('utf8'), state=bool(target))
+        session.add(item)
+
+    session.commit()
+
+# import_data()
+clf = Classifier(grab_all_data)
+
+def _parse_comment_count(subtext_row, hnid):
+    """Return the comment count linked from a story's subtext row, else 0."""
+    links = subtext_row.find_all('a', href="item?id=" + hnid)
+    if not links:
+        return 0
+    comment_count_text = unicode(links[-1].text).replace(u"\xa0comments", "")
+    try:
+        return int(comment_count_text)
+    except ValueError:
+        # the link text was not a plain number; log it and fall back to 0
+        print repr(comment_count_text)
+        return 0
+
+@bottle.route("/update")
+def updateHN():
+    """Scrape the HN front page and insert or refresh FoundItem rows."""
+    print "Updating HN..."
+    session = Session()
+    # a timeout keeps a stalled connection from blocking the caller forever
+    resp = requests.get("https://news.ycombinator.com/", timeout=30)
+    soup = BeautifulSoup(resp.text, "lxml")
+    try:
+        for t in soup.find_all('td', align=None, class_='title'):
+            link = t.find('a', class_="storylink")
+            if link is None:
+                # title cells without a story link carry nothing to record
+                continue
+            url = link["href"]
+            parent_tr = t.parent
+            hnid = parent_tr["id"]
+            comment_count = _parse_comment_count(parent_tr.next_sibling, hnid)
+            check = session.query(FoundItem).filter(FoundItem.url == url).one_or_none()
+            if check is None:
+                item = FoundItem(name=t.text,
+                                 hnid=hnid,
+                                 comment_count=comment_count,
+                                 url=url,
+                                 date=datetime.datetime.now(),
+                                 rating=clf.scan(unicode(t.text)))
+                session.add(item)
+            else:
+                check.hnid = hnid
+                check.comment_count = comment_count
+        session.commit()
+    except Exception:
+        # the scoped session is reused by this thread; leave it usable
+        session.rollback()
+        raise
+
+class SchedThread(Thread):
+    """Thread that runs pending ``schedule`` jobs once a second."""
+    def __init__(self):
+        Thread.__init__(self)
+
+    def run(self):
+        while True:
+            try:
+                schedule.run_pending()
+            except Exception:
+                # one failed job must not end the loop, or the periodic
+                # updates would stop until the process is restarted
+                traceback.print_exc()
+            time.sleep(1)
+
+@bottle.route('/')
+@bottle.view('index.tpl')
+def news():
+    """Render the stored items, optionally limited to the past day or week."""
+    # load news from DB and display
+    session = Session()
+
+    sortCol = 1
+
+    items = session.query(FoundItem)
+
+    showUnder = bottle.request.params.get("all") == "true"
+    if not showUnder:
+        items = items.filter(FoundItem.rating > 0)
+
+    limit = bottle.request.params.get("limit")
+    if limit == "week":
+        ago = datetime.datetime.now() - datetime.timedelta(days=7)
+        items = items.filter(FoundItem.date > ago) # past week
+        sortCol = 2
+    elif limit == "day":
+        ago = datetime.datetime.now() - datetime.timedelta(days=1)
+        items = items.filter(FoundItem.date > ago) # past day
+        sortCol = 2
+    else:
+        items = items.order_by(FoundItem.date.desc()).limit(100)
+
+    return dict(items=items, sortby=sortCol)
+
+@bottle.route('/rate/<id>')
+def rate(id):
+    """Record an up/down vote for item ``id``, retrain and re-rate all items."""
+    session = Session()
+    rating = bottle.request.params.get('rating') == "good"
+    item = session.query(FoundItem).filter(FoundItem.id == id).one_or_none()
+    if item is None:
+        # unknown id: answer 404 instead of an unhandled NoResultFound
+        bottle.abort(404, "No such item.")
+    # insert or update
+    session.merge(TrainedItem(name=item.name, state=rating))
+    session.commit()
+    clf.add(item.name, rating)
+    # re-rate all items in DB
+    for found in session.query(FoundItem):
+        found.rating = clf.scan(found.name)
+    session.commit()
+    bottle.redirect("/")
+
+if __name__ == "__main__":
+    schedule.every(10).minutes.do(updateHN)
+    st = SchedThread()
+    #st.daemon = True
+    st.start()
+    bottle.run(host="0.0.0.0",port=55512)