# Use the following procedure:
#   1. Remove empty lines and script tags.
#   2. Initialize the TTRArray.
#   3. For each line in the document:
#        X = number of non-tag ASCII characters in the line
#        Y = number of tags in the line
#        TTRArray[current line] = X if the line has no tags, else X / Y
#
import lxml
import os
from lxml.html.clean import Cleaner
# Module-level lxml Cleaner with the javascript and style filters switched on
# after construction.
# NOTE(review): nothing in the visible part of this file reads this object;
# clean_word_text builds its own Cleaner. Confirm whether it is still needed.
cleaner = Cleaner()
cleaner.javascript = True # This is True because we want to activate the javascript filter
cleaner.style = True # This is True because we want to activate the styles & stylesheet filter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
def remove_htmltags(text_string):
    """Return *text_string* with every ``<...>`` span replaced by one space.

    Matching is non-greedy and ``.`` does not match a newline, so a tag
    that is split across lines is left untouched.
    """
    import re

    # via http://stackoverflow.com/questions/3351485/how-to-remove-all-html-tags-from-downloaded-page
    tag_pattern = re.compile("<.*?>")
    return tag_pattern.sub(" ", text_string)
def html_open(filename, encoding="utf-8"):
    """Read a text file and return its lines without line terminators.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            locale default.

    Returns:
        A list of strings, one per line, as produced by ``str.splitlines``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    with open(filename, encoding=encoding) as handle:
        return handle.read().splitlines()
def tag_ratio(line):
    """Return the text-to-tag ratio of a single line of HTML.

    With X the number of characters outside any ``<...>`` tag and Y the
    number of tags on the line, the result is X when the line has no
    tags and X / Y otherwise.

    Tags and text are counted directly. An earlier version measured the
    line after replacing each tag with a space, which added one character
    per tag to the text count, divided by tag characters instead of the
    tag count, and returned 1 for every tagless line.

    Args:
        line: One line of HTML source, as a string.

    Returns:
        An int (no tags on the line) or a float (at least one tag).
    """
    import re  # local import so the function does not rely on file-level imports

    tags = re.findall("<.*?>", line)
    # Every character that is not part of a matched tag counts as text;
    # this includes whitespace and non-ASCII characters.
    text_chars = len(line) - sum(len(tag) for tag in tags)
    if not tags:
        return text_chars
    return text_chars / len(tags)
from lxml.html.clean import Cleaner
def clean_word_text(text):
    """Run *text* through an lxml ``Cleaner`` created with ``style=True``.

    The stated purpose is to clear out the contents of
    <style>...</style> tags.
    NOTE(review): every other Cleaner option keeps its library default,
    so more than style content may be removed -- confirm against the
    lxml documentation.
    """
    return Cleaner(style=True).clean_html(text)
def tag_ratio_distribution(input_html_file):
    """Return a pandas Series holding ``tag_ratio`` of each line of a file.

    The file is read with ``html_open``; the Series has one entry per
    line, in file order.
    """
    per_line_ratios = list(map(tag_ratio, html_open(input_html_file)))
    return pd.Series(per_line_ratios)
def tag_ratio_with_line_numbers(input_html_file, filename):
    """Build a three-column DataFrame of per-line tag ratios for one file.

    Columns, in order: the first five characters of ``str(filename)``
    repeated on every row, the zero-based line number, and the value
    from ``tag_ratio_distribution`` for that line. None of the Series is
    named, so the columns carry pandas' default integer labels.
    """
    ratio_column = tag_ratio_distribution(input_html_file)
    row_count = len(ratio_column)
    # Only a five-character prefix of the name is kept as the file label.
    label = str(filename)[:5]
    label_column = pd.Series([label for _ in range(row_count)])
    line_number_column = pd.Series(range(row_count))
    return pd.concat([label_column, line_number_column, ratio_column], axis=1)
def save_to_tsv(dataframe, filename='data.tsv'):
    """Write *dataframe* to *filename* as tab-separated text without the index."""
    tsv_options = {"sep": "\t", "index": False}
    dataframe.to_csv(filename, **tsv_options)
# Quick look at a single page: the first ten per-line ratios
# (the bare expression only displays something in a notebook).
nyt_tag_ratios = tag_ratio_distribution('nyt.html')
nyt_tag_ratios[0:10]

# Directory holding the HTML files to process.
html_output_dir = "/Users/olive/polar_only_rasmuson/polar_only_rasmuson_html_tika_output"
# os.listdir returns entries in arbitrary order, so .DS_Store is excluded
# by name rather than by dropping the first entry. Sorting makes the row
# order of the output file reproducible.
html_files_to_tagratio = sorted(
    name for name in os.listdir(html_output_dir) if name != ".DS_Store"
)
TR_for_fileset = [
    tag_ratio_with_line_numbers(html_output_dir + "/" + x, x)
    for x in html_files_to_tagratio
]
len(TR_for_fileset) #Number of files used
# One combined table for the whole file set.
save_to_tsv(pd.concat(TR_for_fileset), 'data1.tsv')