# this html module is in the public domain # Sam Watkins, 2012 from l import * def split(raw_html): html = raw_html.replace('\r', '') html = raw_html.replace(' ', ' ') lines = re.findall(r'<.*?>\s*|[^<]+', html) lines = [re.sub(r'^\s+|\s+$', '', l) for l in lines] return lines def parse_lines(lines, do_expand_singles=True): parsed = [parse_line(l) for l in lines] if do_expand_singles: parsed = expand_singles(parsed) return parsed def parse_line(l): text = tag = open = close = attrs = None is_tag = re.match(r'^<', l) if is_tag: close = bool(re.match(r'^$', l): close = True (tag, attrs) = re.search(r'([^<>/\s]+)(.*?)/?>$', l).groups() else: text = l # disentify? squeeze spaces? maybe not yet return (text, tag, open, close, attrs, l) def expand_singles(parsed_lines): out = [] for p in parsed_lines: (text, tag, open, close, attrs, raw) = p if open and close: out.append((text, tag, True, False, attrs, raw)) out.append((text, tag, False, True, None, '')) # hmmm...! else: out.append((text, tag, open, close, attrs, raw)) return out def tidy_html_text(t): t = re.sub(r'^\s+|\s+$', '', t) t = re.sub(r'\s+', ' ', t) return t def extract_table_data(parsed, with_th=True, tidy_values=True, a=False): # This does not preseve tags within table cells, # for now I don't need them. rows = [] row = None value = None value_type = None colspan = None # this is 'event driven', not ideal for p in parsed: (text, tag, start, end, attrs, raw) = p # print raw # print " ", text, tag, start, end, repr(attrs) tr = tag == 'tr' td = tag == 'td' th = tag == 'th' cell = td or th ctl = tr or cell if text is not None and value_type: if value == '': value = text else: value += " " + text if ctl and value is not None: if value_type == 'td' or (value_type == 'th' and with_th): if tidy_values: value = tidy_html_text(value) row.append(value) while colspan > 1: row.append('') colspan -= 1 value = None value_type = None colspan = None if start and cell: value = "" value_type = tag # td or th colspan = 1 match = re.search(r'colspan=["\']?(\d+)', attrs) # not quite right! if match: colspan = int(match.group(1)) if a and start and tag == 'a': match = re.search(r'href=["\']?(.*?)["\' >]', attrs) if match: url = match.group(1) # TODO disentify if value: value += " " value += "[%s]" % url if tr: if row is not None: rows.append(row) if start: row = [] if end: row = None return rows def parse_tables(html, need_th=True, with_th=True, tidy_values=True, a=False): lines = split(html) parsed = parse_lines(lines) has_th = None raw_tables = [] # get all tables without other tables in them # This is kind of bad, being 'event driven', rather that procedural. for i in range(0, len(parsed)): (text, tag, start, end, attr, raw) = p = parsed[i] if tag == 'table' and start: table_start = i has_th = 0 if tag == 'th' and table_start is not None: has_th = 1 if tag == 'table' and end and table_start is not None: table_end = i if not need_th or has_th: table = parsed[table_start:table_end+1] raw_tables.append(table) table_start = None has_th = None data = [extract_table_data(t, with_th=with_th, tidy_values=tidy_values, a=a) for t in raw_tables] return data, raw_tables