1+ from bs4 import BeautifulSoup as bs
2+ import bs4
3+ import mistune
4+ from timeout_decorator import TimeoutError , timeout
5+ from functools import partial
6+ from typing import List , Union , Collection , Optional , Any , Callable , Iterable
7+ from urllib3 .util import parse_url
8+ import regex
9+ from textacy .preprocess import preprocess_text , normalize_whitespace
10+ import re
11+
# Module-level mistune markdown parser; `md.parse` calls it to render
# markdown text to HTML before handing the result to BeautifulSoup.
markdown = mistune.Markdown()

# Type aliases (shamelessly stolen from fastai): either a collection of
# anything or a single int/float/str item, and the Optional form of the same.
ListOrItem = Union[Collection[Any], int, float, str]
OptListOrItem = Optional[ListOrItem]
18+
19+
def compose(funcs: List[Callable]) -> Callable:
    """
    Chain `funcs` into a single callable (borrowed from fastai.core.compose).

    The returned callable pushes its first argument through every function
    in order, feeding each result into the next one.  Any additional
    positional or keyword arguments are forwarded unchanged to every function.
    """
    def compose_(funcs, x, *args, **kwargs):
        result = x
        for func in listify(funcs):
            result = func(result, *args, **kwargs)
        return result

    return partial(compose_, funcs)
29+
30+
def listify(p: "OptListOrItem" = None, q: "OptListOrItem" = None):
    """
    shamelessly stolen from fastai.core.listify
    Make `p` listy and the same length as `q`.

    `p` is turned into a list: None becomes [], a string or any non-iterable
    becomes a one-element list, an iterable without a length (generator,
    rank-0 tensor) is wrapped as a single element, and any other iterable is
    copied into a list.

    The target length is `q` itself when `q` is an int, `len(q)` when `q` is
    some other non-None value, and `len(p)` when `q` is None.  A one-element
    `p` is repeated to reach the target length.

    Returns a new list.  Raises AssertionError when the final length does not
    match the target length.
    """
    if p is None:
        p = []
    elif isinstance(p, str) or not isinstance(p, Iterable):
        p = [p]
    else:
        # Rank 0 tensors in PyTorch are Iterable but don't have a length.
        try:
            len(p)
        except TypeError:
            p = [p]
        else:
            # Work on a real list so the repetition below is list repetition:
            # sets/dicts do not support `*`, and arrays would multiply
            # element-wise instead of repeating.
            p = list(p)

    if q is None:
        n = len(p)
    elif type(q) is int:
        n = q
    else:
        n = len(q)

    if len(p) == 1:
        p = p * n
    assert len(p) == n, f'List len mismatch ({len(p)} vs {n} )'
    return list(p)
48+
49+
class md:
    "class that organizes markdown cleanup functions in a single namespace"

    @staticmethod
    def parse(x: str) -> bs4.BeautifulSoup:
        """
        Render markdown text `x` to HTML and return it as a BeautifulSoup tree.

        Raw HTML found in the input is replaced by the marker ' xxxhtml ' and
        every 'xxxlnbrk' marker is turned back into a line break before the
        text is rendered.  Rendering runs under `timeout(1)`; when the timeout
        fires, a tree containing only 'xxxunabletoparse' is returned instead.
        """
        # find & replace html, which can break things (non-greedy).
        # `flags=` must be passed by keyword: the 4th positional argument of
        # re.sub is `count`, not `flags`.
        x = re.sub(r'<.+?>.+?</.+?>|<[a-zA-Z]{1,}.*?>', 'xxxhtml', x, flags=re.DOTALL)

        # because former html replacement was non-greedy dedupe html marker
        x = re.sub(r'(xxxhtml(xxxlnbrk)?(\s)?)+', ' xxxhtml ', x)

        # fix the linebreak issue from BigQuery
        x = re.sub(r'xxxlnbrk( +)?', '\n ', x)

        @timeout(1)
        def timed_parse(x):
            try:
                return bs(markdown(x), features="html5lib")
            except TimeoutError:
                return bs(markdown('xxxunabletoparse'), features="html5lib")

        return timed_parse(x)

    @staticmethod
    def prepend(fldname: str, tag: Union[List[str], str], soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """
        Insert `fldname` followed by a space at the start of every `tag` element.

        `tag` is a tag name or a list of tag names.  Elements whose text is
        empty after stripping are skipped, except for <hr>.  `soup` is
        modified in place and returned.
        """
        for node in soup.find_all(listify(tag)):
            if node.text.strip() or node.name == 'hr':
                node.insert(0, fldname + ' ')
        return soup

    @staticmethod
    def enclose(bfldname: str, efldname: str, tag: Union[List[str], str], nlines: int, soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """
        Helper function for when you want to add a beginning and ending marker to text.

        Every `tag` element has its content replaced by a preview of its text
        (the first `nlines` and last `nlines` pieces when there are more than
        `2 * nlines` pieces), prefixed with `bfldname` plus the element's
        class attribute values, and suffixed with `efldname`.  `soup` is
        modified in place and returned.
        """
        for node in soup.find_all(listify(tag)):

            # preview the text inside an enclosure show nlines of beginning and nlines of the end.
            text_lines = node.text.split('\n ')
            if len(text_lines) <= nlines * 2:
                newstr = node.text
            else:
                newstr = '\n '.join(text_lines[:nlines] + text_lines[-nlines:])

            node.string = newstr

            # add the values of the class attributes, if exist
            css_classes = ' '.join(node['class']) if 'class' in node.attrs else ''
            node.insert(0, bfldname + ' ' + css_classes + ' ')

            # insert ending tag with/without space depending if last char is \n
            # (endswith: a single character can never equal a longer string)
            if node.text.endswith('\n'):
                node.append(efldname)
            else:
                node.append(' ' + efldname)
        return soup

    @staticmethod
    def lst(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate list elements <ul> and <ol>"
        for node in soup.find_all(['ul', 'ol']):
            # clear all the artifacts that are in lists and replace with text.
            text = 'xxxlistB ' + node.getText() + 'xxxlistE'
            node.string = text.strip()
        return soup

    @staticmethod
    def tbl(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate table elements <table> only keeping information from header rows"
        for node in soup.find_all('table'):
            # empty string if there are no table headers.
            text = ''
            if node.thead:
                text = 'xxtbl ' + '|'.join([x.getText() for x in node.thead.find_all('th')])
            node.string = text
        return soup

    @staticmethod
    def img(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate <img> elements with a marker, their alt text and the last dot-separated piece of src"
        for node in soup.find_all('img'):
            node.insert(0, 'xxximg ')
            if 'alt' in node.attrs:
                node.insert(1, node['alt'])
            if 'src' in node.attrs:
                node.append(' xxximgf ' + node['src'].split('.')[-1])
        return soup

    @staticmethod
    def lnk(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate <a> elements with the host of their href and with their title, when present"
        for node in soup.find_all('a'):
            if 'href' in node.attrs:
                # Best effort: an href that cannot be parsed, or that has no
                # host (e.g. a relative link), simply gets no host annotation.
                try:
                    host = parse_url(node['href']).host
                except Exception:
                    host = None
                if host is not None:
                    node.append(' xxxlnkhb ' + host + ' xxxlnkhe')
            if 'title' in node.attrs:
                node.append(' xxxlnktb ' + node['title'] + ' xxxlnkte ')
        return soup

    @staticmethod
    def get_text(soup: bs4.BeautifulSoup) -> str:
        "get the raw text"
        text = soup.getText()
        # translate newlines back from BigQuery
        text = re.sub(r'\n\n+', '\n ', text)
        # translate double quotes back from BigQuery
        text = re.sub(r'xxxdblqte', ' \" ', text)
        return normalize_whitespace(text)

    @staticmethod
    def sym(text: str) -> str:
        """
        generalize symbols such as emails, phone numbers and filepaths to generic tokens.

        Emails, phone numbers and accents are handled by textacy's
        `preprocess_text`; file paths, @ mentions, date/times, dotted strings,
        IP-like strings, long strings/numbers and brace-delimited (json-like)
        spans are then replaced by marker tokens, in that order.
        """
        text = preprocess_text(text,
                               no_emails=True,
                               no_phone_numbers=True,
                               no_accents=True)

        # generalize file paths
        file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}|[Cc]:\\\w+(\\[0-9a-zA-Z_\-]+)+'
        text = re.sub(file_path_regex, ' xxxfilepath ', text)

        # generalize @ mentions
        at_mention_regex = r'\W@\w+'
        text = re.sub(at_mention_regex, ' xxxatmention ', text)

        # get date/time
        text = re.sub(r'\d+[-/]\d+[-/]\d+(.{0,2})?(\d+:\d+:\d+)', ' xxxdatetm ', text)

        # strings that have >=4 dots w/o any whitespace in between
        text = re.sub(r'(\S+\.\S+){4,}', 'xxunk', text)

        # things that look like IP addresses (every separator is a literal dot)
        text = re.sub(r'\d+\.\d+\.\d+\.\d+', 'xxunk', text)

        # long strings or numbers
        text = re.sub(r'\S{30,}|\d{6,}', 'xxunk', text)

        # generalize json (recursive pattern, needs the `regex` module)
        json_regex = r'\{(?:[^{}]|(?R))*\}'
        text = regex.sub(json_regex, ' xxxjson ', text)

        return text

    ### transformations that are the same from factory functions
    # (`.__func__` unwraps the staticmethod object so it can be used inside the class body)
    # large headers: h1
    hL = partial(prepend.__func__, 'xxxhl', 'h1')
    # medium headers: h2, h3
    hM = partial(prepend.__func__, 'xxxhm', ['h2', 'h3'])
    # small headers: h4, h5, h6
    hS = partial(prepend.__func__, 'xxxhs', ['h4', 'h5', 'h6'])
    # code blocks
    code = partial(enclose.__func__, ' xxxcdb ', ' xxxcde ', 'code', 2)
    # paragraph blocks (plain text)
    txt = partial(prepend.__func__, '', 'p')
    # block quotes
    bqt = partial(enclose.__func__, 'xxxqb', 'xxxqe', 'blockquote', 3)
    # strikethrough
    st = partial(enclose.__func__, 'xxxdelb', 'xxxdele', 'del', 1)
    # horizontal rule
    hr = partial(prepend.__func__, 'xxxhr', 'hr')
208+
209+
# Ordered transformation steps: md.parse turns the raw string into a
# BeautifulSoup tree, the steps up to md.hr annotate that tree in place,
# md.get_text flattens it back to a string and md.sym rewrites that string.
# The order matters because the input/output types change along the way.
# NOTE(review): presumably meant to be passed to `compose` -- confirm with callers.
transform_pre_rules = [md.parse, md.hL, md.hM, md.hS, md.lst, md.bqt,
                       md.code, md.tbl, md.st, md.txt, md.lnk, md.img,
                       md.hr, md.get_text, md.sym]
0 commit comments