[fix] ignore scripts/styles in html_to_text
This commit is contained in:
		
							parent
							
								
									469e08881e
								
							
						
					
					
						commit
						1408859b4b
					
				@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64',
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
 | 
					ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					blocked_tags = ('script',
 | 
				
			||||||
 | 
					                'style')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def gen_useragent():
 | 
					def gen_useragent():
 | 
				
			||||||
    # TODO
 | 
					    # TODO
 | 
				
			||||||
@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser):
 | 
				
			|||||||
    def __init__(self):
 | 
					    def __init__(self):
 | 
				
			||||||
        HTMLParser.__init__(self)
 | 
					        HTMLParser.__init__(self)
 | 
				
			||||||
        self.result = []
 | 
					        self.result = []
 | 
				
			||||||
 | 
					        self.tags = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def handle_starttag(self, tag, attrs):
					        """Record an opening tag on the open-tag stack.

					        ``tag`` is appended to ``self.tags``. ``attrs`` is accepted because
					        the parser callback passes it, but it is not used here.
					        """
					        self.tags.append(tag)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def handle_endtag(self, tag):
					        """Remove the most recently opened tag from the open-tag stack.

					        Raises:
					            Exception: with the message "invalid html" when ``tag`` is not
					                the most recently opened tag, or when no tag is open.
					        """
					        # An end tag with nothing open is reported like any other mismatch
					        # rather than leaking an IndexError from ``self.tags[-1]``.
					        # NOTE(review): a start tag that never gets an end tag (e.g. <br>)
					        # stays on the stack, so the next end tag raises -- confirm that
					        # rejecting such documents is intended.
					        if not self.tags or tag != self.tags[-1]:
					            raise Exception("invalid html")
					        self.tags.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def is_valid_tag(self):
					        """Tell whether content at the current position should be kept.

					        Returns True when no tag is open; otherwise returns whether the
					        most recently opened tag is absent from ``blocked_tags``.
					        """
					        if self.tags:
					            innermost = self.tags[-1]
					            return innermost not in blocked_tags
					        return True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_data(self, d):
        """Collect a chunk of text.

        The chunk ``d`` is appended to ``self.result`` only when
        ``self.is_valid_tag()`` is true; otherwise it is dropped.
        """
        if self.is_valid_tag():
            self.result.append(d)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_charref(self, number):
 | 
					    def handle_charref(self, number):
 | 
				
			||||||
 | 
					        if not self.is_valid_tag():
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
        if number[0] in (u'x', u'X'):
 | 
					        if number[0] in (u'x', u'X'):
 | 
				
			||||||
            codepoint = int(number[1:], 16)
 | 
					            codepoint = int(number[1:], 16)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser):
 | 
				
			|||||||
        self.result.append(unichr(codepoint))
 | 
					        self.result.append(unichr(codepoint))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_entityref(self, name):
 | 
					    def handle_entityref(self, name):
 | 
				
			||||||
 | 
					        if not self.is_valid_tag():
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
        # codepoint = htmlentitydefs.name2codepoint[name]
 | 
					        # codepoint = htmlentitydefs.name2codepoint[name]
 | 
				
			||||||
        # self.result.append(unichr(codepoint))
 | 
					        # self.result.append(unichr(codepoint))
 | 
				
			||||||
        self.result.append(name)
 | 
					        self.result.append(name)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user