initial import

2016-12-05 21:22:44 -05:00
commit 66cc8b6ad5
17 changed files with 581 additions and 0 deletions
--- a/alembic.ini
+++ b/alembic.ini
@@ -0,0 +1,68 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration files
+# file_template = %%(rev)s_%%(slug)s
+
+# max length of characters to apply to the
+# "slug" field
+#truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; this defaults
+# to alembic/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path
+# version_locations = %(here)s/bar %(here)s/bat alembic/versions
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = sqlite:///hnlearn.db
+
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
--- a/alembic/README
+++ b/alembic/README
@@ -0,0 +1 @@
+Generic single-database configuration.
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -0,0 +1,73 @@
+from __future__ import with_statement
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+from logging.config import fileConfig
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+fileConfig(config.config_file_name)
+
+# Model MetaData object used for 'autogenerate' support.
+# db is imported as a top-level module, so the current working directory is
+# appended to sys.path first; this presumably expects alembic to be invoked
+# from the directory that contains db.py -- TODO confirm.
+import sys
+sys.path.append(".")
+# NOTE(review): importing db also executes its module-level statements
+# (engine creation and Base.metadata.create_all).
+from db import Base
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline():
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well.  By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url, target_metadata=target_metadata, literal_binds=True)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online():
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    connectable = engine_from_config(
+        config.get_section(config.config_ini_section),
+        prefix='sqlalchemy.',
+        poolclass=pool.NullPool)
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection,
+            target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
--- a/alembic/env.pyc
+++ b/alembic/env.pyc
--- a/alembic/script.py.mako
+++ b/alembic/script.py.mako
@@ -0,0 +1,24 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+
+## Body of upgrade(): the rendered `upgrades` text, or "pass" when it is empty.
+def upgrade():
+    ${upgrades if upgrades else "pass"}
+
+
+## Body of downgrade(): the rendered `downgrades` text, or "pass" when it is empty.
+def downgrade():
+    ${downgrades if downgrades else "pass"}
--- a/alembic/versions/131d925693a4_added_hnid_column.py
+++ b/alembic/versions/131d925693a4_added_hnid_column.py
@@ -0,0 +1,28 @@
+"""added hnid column
+
+Revision ID: 131d925693a4
+Revises: d1f648fcd62a
+Create Date: 2016-12-05 19:12:17.872699
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '131d925693a4'
+down_revision = 'd1f648fcd62a'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('found_items', sa.Column('hnid', sa.String(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('found_items', 'hnid')
+    # ### end Alembic commands ###
--- a/alembic/versions/131d925693a4_added_hnid_column.pyc
+++ b/alembic/versions/131d925693a4_added_hnid_column.pyc
--- a/alembic/versions/d1f648fcd62a_create_initial_state.py
+++ b/alembic/versions/d1f648fcd62a_create_initial_state.py
@@ -0,0 +1,34 @@
+"""create initial state
+
+Revision ID: d1f648fcd62a
+Revises: 
+Create Date: 2016-12-05 18:37:48.117437
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd1f648fcd62a'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.create_table('trained_items',
+                    sa.Column('name', sa.String, primary_key=True),
+                    sa.Column('state', sa.Boolean))
+    op.create_table('found_items',
+                    sa.Column('id', sa.Integer, primary_key=True),
+                    sa.Column('name', sa.String),
+                    sa.Column('url', sa.String, unique=True, index=True),
+                    sa.Column('date', sa.DateTime, index=True),
+                    sa.Column('rating', sa.Float))
+    pass
+
+
+def downgrade():
+    op.drop_table('trained_items')
+    op.drop_table('found_items')
--- a/alembic/versions/d1f648fcd62a_create_initial_state.pyc
+++ b/alembic/versions/d1f648fcd62a_create_initial_state.pyc
--- a/alembic/versions/d1f648fcd62a_create_initial_state.py~
+++ b/alembic/versions/d1f648fcd62a_create_initial_state.py~
@@ -0,0 +1,25 @@
+"""create initial state
+
+Revision ID: d1f648fcd62a
+Revises: 
+Create Date: 2016-12-05 18:37:48.117437
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd1f648fcd62a'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # No schema operations here: the body is only a placeholder ``pass``.
+    
+    pass
+
+
+def downgrade():
+    # NOTE(review): drop_table is called with an empty table name.  This file
+    # looks like an unfinished editor backup ("~" suffix) of the
+    # d1f648fcd62a revision -- confirm whether it should be committed at all.
+    op.drop_table('')
--- a/alembic/versions/e07ebf603ff2_added_comment_count_column.py
+++ b/alembic/versions/e07ebf603ff2_added_comment_count_column.py
@@ -0,0 +1,28 @@
+"""added comment_count column
+
+Revision ID: e07ebf603ff2
+Revises: 131d925693a4
+Create Date: 2016-12-05 19:49:45.783201
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e07ebf603ff2'
+down_revision = '131d925693a4'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('found_items', sa.Column('comment_count', sa.Integer(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('found_items', 'comment_count')
+    # ### end Alembic commands ###
--- a/alembic/versions/e07ebf603ff2_added_comment_count_column.pyc
+++ b/alembic/versions/e07ebf603ff2_added_comment_count_column.pyc
--- a/db.py
+++ b/db.py
@@ -0,0 +1,26 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Boolean, Integer, String, DateTime, Float, create_engine
+from sqlalchemy.orm import sessionmaker, scoped_session
+
+
+Base = declarative_base()
+
+class TrainedItem(Base):
+	"""ORM row of ``trained_items``: a labelled name used as training data."""
+	__tablename__ = 'trained_items'
+	# Primary key; web.py's rate() stores a FoundItem's name here.
+	name = Column(String, primary_key=True)
+	# Label; web.py's rate() stores True for a "good" rating, False otherwise.
+	state = Column(Boolean)
+
+class FoundItem(Base):
+        """ORM row of ``found_items``: one scraped story and its score."""
+        __tablename__ = 'found_items'
+        id = Column(Integer, primary_key=True)
+        # ``id`` attribute of the story's table row on the scraped page.
+        hnid = Column(String)
+        # Number parsed from the story's comments link (0 when not parseable).
+        comment_count = Column(Integer)
+        # Text of the title cell.
+        name = Column(String)
+        # Story link; unique, and used by the scraper to detect known items.
+        url = Column(String, unique=True, index=True)
+        # Local time at which the row was first inserted.
+        date = Column(DateTime, index=True)
+        # Classifier decision-function value for ``name``.
+        rating = Column(Float)
+
+# SQLite database file; alembic.ini configures the same URL as sqlalchemy.url.
+engine = create_engine('sqlite:///hnlearn.db')
+# NOTE(review): this runs on every import of the module (including the import
+# from alembic/env.py) and creates any tables missing from the database --
+# confirm this is intended alongside the Alembic migrations.
+Base.metadata.create_all(engine)
+# Session is a scoped_session registry; Session() yields a session bound to engine.
+session_factory = sessionmaker(bind=engine)
+Session = scoped_session(session_factory)
--- a/learn.py
+++ b/learn.py
@@ -0,0 +1,26 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import SGDClassifier
+
+class Classifier(object):
+        def __init__(self, datagrabber):
+                self.grabber = datagrabber
+                self.reload()
+                
+        def reload(self):
+                Xs, Ys = self.grabber()
+
+                self.vect = CountVectorizer(analyzer='word',ngram_range=(1,3))
+                self.train_vec = self.vect.fit_transform(Xs)
+                
+                self.clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=500, random_state=42)
+                self.text_clf = self.clf.fit(self.train_vec, Ys)
+
+        def scan(self, name):
+                v = self.vect.transform([name])
+                return self.text_clf.decision_function(v)[0]
+
+        def add(self, name, state):
+                # implement add using partial_fit
+                # this would mean switching to hashing vectorizer, which means we can't reverse the model
+                # so for now we're just going to reload completely
+                self.reload()
--- a/views/index.tpl
+++ b/views/index.tpl
@@ -0,0 +1,64 @@
+<html>
+<head>
+	<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.css"/>
+
+	<script type="text/javascript" src="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.js"></script>
+	<script>
+	$(document).ready(function()
+	{
+		$('#table').DataTable({
+			"order": [[{{sortby}}, 'desc']],
+			"paging": false
+		});
+	});
+	</script>
+</head>
+<body>
+<div class="container">
+<nav class="navbar navbar-default">
+     <div class="container-fluid">
+          <div class="navbar-header">
+	       <a class="navbar-brand" href="#">HNLearn</a>
+	  </div>
+
+	  <div class="collapse navbar-collapse">
+	       <ul class="nav navbar-nav">
+	       	   <li><a href="/">Latest > 0</a></li>
+		   <li><a href="/?all=true">Latest with < 0</a></li>
+		   <li><a href="/?limit=day&all=true">Top for Past Day</a></li>
+		   <li><a href="/?limit=week&all=true">Top for Past Week</a></li>
+	       </ul>
+	  </div>
+     </div>
+</nav>
+<table id="table" class="table">
+        <thead>
+	        <tr>
+		  <th>Title</th>
+		  <th>Date</th>
+		  <th>Score</th>
+		  <th>Up/Down</th>
+		  <th>Comments</th>
+	        </tr>
+	</thead>
+	<tbody>
+% # One table row per item.  The comments link uses HN's item?id=<hnid> form.
+%for x in items:
+	<tr>
+	  <td>
+	    <a href="{{x.url}}">{{x.name}}</a>
+	  </td>
+	  <td>{{x.date.strftime("%Y-%m-%d %H:%M")}}</td>
+	  <td>{{"{0:.4g}".format(x.rating)}}</td>
+	  <td><a href="/rate/{{x.id}}?rating=good">Up</a> / <a href="/rate/{{x.id}}?rating=bad">Down</a></td>
+	  <td>
+	    <a href="https://news.ycombinator.com/item?id={{x.hnid}}">
+	      {{x.comment_count}} comments
+	    </a>
+	  </td>
+	</tr>
+%end
+	</tbody>
+</table>
+</div>
+</body>
+</html>
--- a/views/index.tpl~
+++ b/views/index.tpl~
@@ -0,0 +1,57 @@
+<html>
+<head>
+	<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.css"/>
+
+	<script type="text/javascript" src="https://cdn.datatables.net/v/bs-3.3.6/jq-2.2.3/dt-1.10.12/datatables.js"></script>
+	<script>
+	$(document).ready(function()
+	{
+		$('#table').DataTable({
+			"order": [[{{sortby}}, 'desc']],
+			"paging": false
+		});
+	});
+	</script>
+</head>
+<body>
+<div class="container">
+<nav class="navbar navbar-default">
+     <div class="container-fluid">
+          <div class="navbar-header">
+	       <a class="navbar-brand" href="#">HNLearn</a>
+	  </div>
+
+	  <div class="collapse navbar-collapse">
+	       <ul class="nav navbar-nav">
+	       	   <li><a href="/">Latest > 0</a></li>
+		   <li><a href="/?all=true">Latest with < 0</a></li>
+		   <li><a href="/?limit=day&all=true">Top for Past Day</a></li>
+		   <li><a href="/?limit=week&all=true">Top for Past Week</a></li>
+	       </ul>
+	  </div>
+     </div>
+</nav>
+<table id="table" class="table">
+        <thead>
+	        <tr>
+			<th>Title</th>
+			<th>Date</th>
+			<th>Score</th>
+			<th>Up/Down</th>
+	        </tr>
+	</thead>
+	<tbody>
+%for x in items:
+	<tr>
+		<td><a href="{{x.url}}">{{x.name}}</a></td>
+		<td>{{x.date.strftime("%Y-%m-%d %H:%M")}}</td>
+		<td>{{"{0:.4g}".format(x.rating)}}</td>
+		<td><a href="/rate/{{x.id}}?rating=good">Up</a> / <a href="/rate/{{x.id}}?rating=bad">Down</a></td>
+		
+	</tr>
+%end
+	</tbody>
+</table>
+</div>
+</body>
+</html>
--- a/web.py
+++ b/web.py
@@ -0,0 +1,127 @@
+from db import *
+from learn import *
+import datetime
+
+import requests
+from bs4 import BeautifulSoup
+
+import bottle
+from threading import Thread
+import schedule
+import time
+
+def grab_all_data():
+        session = Session()
+        Xs = []
+        Ys = []
+        for  name, state in session.query(TrainedItem.name, TrainedItem.state):
+                Xs.append(name)
+                Ys.append(state)
+        return (Xs, Ys)
+
+def import_data():
+        session = Session()
+        trained = load_files('data', shuffle=True)
+        
+        for i in range(len(trained.data)):
+                item = TrainedItem(name=trained.data[i].decode('utf8'), state=bool(trained.target[i]))
+                session.add(item)
+                
+        session.commit()
+
+# One-off bootstrap, kept disabled: import_data()
+# Module-level classifier; its constructor trains it at import time from the
+# TrainedItem rows returned by grab_all_data.
+clf = Classifier(grab_all_data)
+
+@bottle.route("/update")
+def updateHN():
+        print "Updating HN..."
+        session = Session()
+        resp = requests.get("https://news.ycombinator.com/")
+        soup = BeautifulSoup(resp.text, "lxml")
+        for t in soup.find_all('td', align=None, class_='title'):
+                parent_tr = t.parent
+                url = t.find('a', class_="storylink")
+                if url is not None:
+                        url = url["href"]
+                        print parent_tr
+                        hnid = parent_tr["id"]
+                        comment_count_text = unicode(parent_tr.next_sibling.find_all('a', href="item?id=" + hnid)[-1].text).replace(u"\xa0comments", "")
+                        comment_count = 0
+                        try:
+                                comment_count = int(comment_count_text)
+                        except:
+                                print repr(comment_count_text)
+                                pass
+                        check = session.query(FoundItem).filter(FoundItem.url == url).one_or_none()
+                        print url, check
+                        if check is None:
+                                print url
+                                item = FoundItem(name=t.text,
+                                                 hnid=hnid,
+                                                 comment_count=comment_count,
+                                         url=url,
+                                         date=datetime.datetime.now(),
+                                         rating=clf.scan(unicode(t.text)))
+                                session.add(item)
+                        else:
+                                check.hnid = hnid
+                                check.comment_count = comment_count
+        session.commit()
+
+class SchedThread(Thread):
+        def __init__(self):
+                Thread.__init__(self)
+
+        def run(self):
+                while True:
+                        schedule.run_pending()
+                        time.sleep(1)
+        
+@bottle.route('/')
+@bottle.view('index.tpl')
+def news():
+        # load news from DB and display
+        session = Session()
+        
+        sortCol = 1
+        
+        items = session.query(FoundItem)
+        
+        showUnder = bottle.request.params.get("all") == "true"
+        if not showUnder:
+                items = items.filter(FoundItem.rating > 0)
+        
+        if bottle.request.params.get("limit") == "week":
+                ago = datetime.datetime.now() - datetime.timedelta(days=7)
+                items = items.filter(FoundItem.date > ago) # past week
+                sortCol = 2
+        elif bottle.request.params.get("limit") == "day":
+                ago = datetime.datetime.now() - datetime.timedelta(days=1)
+                items = items.filter(FoundItem.date > ago) # past day
+                sortCol = 2
+        else:
+                items = items.order_by(FoundItem.date.desc()).limit(100)
+
+        return dict(items=items, sortby=sortCol)
+
+@bottle.route('/rate/<id:int>')
+def rate(id):
+        session = Session()
+        rating = bottle.request.params.get('rating') == "good"
+        item = session.query(FoundItem).filter(FoundItem.id == id).one()
+        # insert or update
+        session.merge(TrainedItem(name=item.name, state=rating))
+        session.commit()
+        clf.add(item.name, rating)
+        # re-rate all items in DB
+        for item in session.query(FoundItem):
+                item.rating = clf.scan(item.name)
+        session.commit()
+        bottle.redirect("/")
+
+if __name__ == "__main__":
+        schedule.every(10).minutes.do(updateHN)
+        st = SchedThread()
+        #st.daemon = True
+        st.start()
+        bottle.run(host="0.0.0.0",port=55512)