In [137]:
# Use following procedure
# Remove empty lines and script tags
# Initialize the TTRArray
# For each line in document
#   X = non tag ASCII characters
#   Y = # of tags in the line
#   TTRArray[current line] = X if no tags, else TTRArray[current line] = X / Y
# # 
In [138]:
import lxml
import os
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True # This is True because we want to activate the javascript filter
cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [139]:
def remove_htmltags(text_string):
    import re
    notag = re.sub("<.*?>", " ", text_string) #via http://stackoverflow.com/questions/3351485/how-to-remove-all-html-tags-from-downloaded-page
    return(notag)
In [140]:
def html_open(filename):
    with open(filename) as f:
        mylist = f.read().splitlines() 
    return(mylist)
In [141]:
def tag_ratio(line):
    starting_length = len(line)
    chars = len(remove_htmltags(line))
    if chars-starting_length==0:
        return 1
    else:
        return( chars/(starting_length-chars) )
In [142]:
from lxml.html.clean import Cleaner

def clean_word_text(text):
    # The only thing I need Cleaner for is to clear out the contents of
    # <style>...</style> tags
    cleaner = Cleaner(style=True)
    return cleaner.clean_html(text)
In [143]:
def tag_ratio_distribution(input_html_file):
    return(pd.Series([tag_ratio(line) for line in html_open(input_html_file)]))
In [144]:
def tag_ratio_with_line_numbers(input_html_file, filename):
    tag_ratios = TR_ratios_per_line = tag_ratio_distribution(input_html_file)
    total_num_lines = len(TR_ratios_per_line)
#   print(total_num_lines)
    line_numbers = pd.Series(range(0, total_num_lines))
    filename = pd.Series([str(filename)[0:5]] * total_num_lines)
    DF = pd.concat([filename, line_numbers, tag_ratios], axis=1)
    return(DF)
In [145]:
def save_to_tsv(dataframe, filename='data.tsv'):
    dataframe.to_csv(filename, sep='\t', index=False)
In [146]:
nyt_tag_ratios = tag_ratio_distribution('nyt.html')
In [147]:
nyt_tag_ratios[0:10]
Out[147]:
0    0.071429
1    0.030303
2    0.200000
3    2.769231
4    0.018519
5    0.015152
6    0.025641
7    0.002538
8    0.025641
9    0.000089
dtype: float64
In [148]:
html_files_to_tagratio = os.listdir("/Users/olive/polar_only_rasmuson/polar_only_rasmuson_html_tika_output")[1:] #exclude the .DS_Store file
In [149]:
TR_for_fileset = [tag_ratio_with_line_numbers("/Users/olive/polar_only_rasmuson/polar_only_rasmuson_html_tika_output/" + x, x) for x in html_files_to_tagratio]
In [150]:
len(TR_for_fileset) #Number of files used
Out[150]:
42
In [151]:
save_to_tsv(pd.concat(TR_for_fileset), 'data1.tsv')
In [ ]: