#htmlparser
Explore tagged Tumblr posts
x-yuri · 8 years ago
Text
python: parse html
NOTE It doesn't handle unclosed tags well, e.g. an unclosed p tag. For instance,
<p><div></div>
gets printed with the <div> indented as a child of the <p>,
instead of as its sibling.
(The original post showed the difference through indentation,
which was lost in this copy.)
from html.parser import HTMLParser
import re
import urllib.request


def rm_extra_spaces(s):
    """Collapse runs of whitespace in *s* to single spaces and strip the ends."""
    return re.sub(r'\s+', ' ', s).strip()


def rm_spaces(s):
    """Remove all whitespace from *s*."""
    return re.sub(r'\s+', '', s)


def ellipsize(s, max_len):
    """Truncate *s* to at most *max_len* characters, ending in '...' if cut."""
    return s[:max_len - 3] + '...' if len(s) > max_len else s


class CategoryPageParse(HTMLParser):
    """Pretty-print an HTML document as an indented tag tree.

    Each start tag, end tag and (whitespace-collapsed, ellipsized) text
    node is printed on its own line, indented four spaces per level.
    """

    def __init__(self, debug=False):
        super().__init__()
        # Fix: the original accepted `debug` but never stored it.  It is
        # kept (and now stored) for interface compatibility with the
        # extracting variant of this parser; this printing-only version
        # always emits output.
        self.debug = debug
        self.level = -1  # depth of the current tag; -1 = before the root

    def handle_starttag(self, tag, attrs):
        self.level += 1
        # print_opening_tag converts attrs to a dict itself; the original
        # redundantly converted here as well.
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        self.print_at_level('</%s>' % tag, self.level)
        self.level -= 1

    def handle_data(self, data):
        # Collapse whitespace and shorten long text so the tree stays readable.
        data_processed = ellipsize(rm_extra_spaces(data), 50)
        if data_processed:
            self.print_at_level(data_processed, self.level + 1)

    def print_opening_tag(self, tag, attrs):
        """Print '<tag id="..." class="...">', keeping only id/class attrs."""
        attrs = dict(attrs)
        els = [tag]
        if 'id' in attrs:
            els.append('id="%s"' % attrs['id'])
        if 'class' in attrs:
            els.append('class="%s"' % attrs['class'])
        self.print_at_level('<%s>' % ' '.join(els), self.level)

    def print_at_level(self, s, level):
        """Print *s* indented four spaces per *level*."""
        print('%s%s' % (' ' * level * 4, s))
or to extract some data:
from html.parser import HTMLParser
import re
import urllib.request


def rm_extra_spaces(s):
    # Collapse runs of whitespace to single spaces and strip the ends.
    return re.sub(r'\s+', ' ', s).strip()


def rm_spaces(s):
    # Remove all whitespace from s.
    return re.sub(r'\s+', '', s)


def ellipsize(s, max_len):
    # Truncate s to at most max_len characters, ending in '...' if cut.
    return s[:max_len - 3] + '...' if len(s) > max_len else s


class CategoryPageParse(HTMLParser):
    """Extract into self.desc the text of the first visible <p> inside the
    element with id="content"; optionally pretty-print the tree when
    debug=True."""

    def __init__(self, debug=False):
        super().__init__()
        self.debug = debug            # when True, print the indented tag tree
        self.in_content = False       # currently inside the id="content" element?
        self.in_p = False             # currently inside the targeted <p>?
        self.desc = None              # extracted description, once complete
        self.desc_buffer = []         # text chunks collected while in_p
        self.level = -1               # depth of the current tag; -1 = before root
        self.last_starttag = []       # stack of open tags: {'tag', 'attrs'}

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        self.level += 1
        # Entering the content container: remember its depth so the matching
        # end tag can be recognized.
        if attrs.get('id', None) == 'content':
            self.in_content = True
            self.content_level = self.level
        # Start collecting only for the first visible <p> (desc still unset).
        if not self.desc and self.in_content and self.tag_is_visible_p(tag, attrs):
            self.in_p = True
            self.p_level = self.level
        self.last_starttag.append({'tag': tag, 'attrs': attrs})
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        if self.debug:
            self.print_at_level('</%s>' % tag, self.level)
        self.last_starttag.pop()
        # Closing the collected <p>: freeze the buffered text into desc.
        if self.in_p and self.level == self.p_level:
            self.in_p = False
            self.desc = ' '.join(self.desc_buffer)
        if self.in_content and self.level == self.content_level:
            self.in_content = False
        self.level -= 1

    def handle_data(self, data):
        if self.in_p:
            self.desc_buffer.append(rm_extra_spaces(data))
        if self.debug:
            data_processed = ellipsize(rm_extra_spaces(data), 50)
            if data_processed:
                self.print_at_level(data_processed, self.level + 1)

    def tag_is_visible_p(self, tag, attrs):
        # A <p> not hidden via inline display:none.
        return tag == 'p' and not self.tag_is_invisible(tag, attrs)

    def tag_is_invisible(self, tag, attrs):
        # Match display:none in the style attribute, whitespace-insensitively.
        attrs = dict(attrs)
        style_attr = attrs.get('style', '')
        return re.search(r'(^|;)display:none($|;)', rm_spaces(style_attr))

    def print_opening_tag(self, tag, attrs):
        # Debug-only: print '<tag id="..." class="...">' at the current depth.
        if not self.debug:
            return
        attrs = dict(attrs)
        els = [tag]
        if 'id' in attrs:
            els.append('id="%s"' % attrs['id'])
        if 'class' in attrs:
            els.append('class="%s"' % attrs['class'])
        self.print_at_level('<%s>' % ' '.join(els), self.level)

    def print_at_level(self, s, level):
        # Print s indented four spaces per level.
        print('%s%s'
              % (' ' * level * 4, s))
previous version
#!/usr/bin/env python
"""Fetch a page and pretty-print its HTML as an indented tag tree."""
import re
import urllib.request
from html.parser import HTMLParser


class CategoryPageParse(HTMLParser):
    """Print every start tag, end tag and text node, indented four
    spaces per nesting level."""

    def __init__(self):
        self.level = -1  # depth of the current tag; -1 = before the root
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.level += 1
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        print('%s%s' % (' ' * self.level * 4, '</' + tag + '>'))
        self.level -= 1

    def handle_data(self, data):
        # Collapse whitespace; ellipsize text longer than 50 characters.
        data_processed = re.sub(r'\s+', ' ', data).strip()
        if len(data_processed) > 50:
            data_processed = data_processed[:47] + '...'
        if data_processed:
            print('%s%s' % (' ' * (self.level + 1) * 4, data_processed))

    def print_opening_tag(self, tag, attrs):
        """Print '<tag id="..." class="...">', keeping only id/class attrs."""
        attrs = dict(attrs)
        # Renamed from `id`, which shadowed the builtin.
        id_attr = ' id="%s"' % attrs['id'] if attrs.get('id') else ''
        klass = ' class="%s"' % attrs['class'] if attrs.get('class') else ''
        print('%s<%s%s%s>' % (' ' * self.level * 4, tag, id_attr, klass))


def charset_from_content_type(content_type, default='utf-8'):
    """Return the charset parameter of a Content-Type header value.

    Falls back to *default* when no charset parameter is present.  The
    original inline code raised StopIteration when the header had
    parameters but none of them was a charset.
    """
    for param in re.split(r'\s*;\s*', content_type)[1:]:
        match = re.match(r'\s*charset\s*=\s*(\S+)', param)
        if match:
            return match.group(1)
    return default


def main():
    resp = urllib.request.urlopen('http://example.com')
    charset = charset_from_content_type(resp.getheader('Content-Type'))
    parser = CategoryPageParse()
    parser.feed(resp.read().decode(charset))


# Guarded so importing this module no longer performs network I/O.
if __name__ == '__main__':
    main()
or to extract some data:
class CategoryPageParse(HTMLParser):
    """Pretty-print an HTML tree while capturing into self.desc the text
    of the first <p> found inside the element with id="content"."""

    def __init__(self):
        self.in_content = False    # inside the id="content" element?
        self.in_p = False          # inside a <p> within that element?
        self.desc = None           # extracted description, once complete
        self.desc_buffer = []      # text chunks collected while in_p
        self.level = -1            # current nesting depth
        self.last_starttag = []    # stack of open tags ({'tag', 'attrs'})
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.level += 1
        attr_map = dict(attrs)
        if attr_map.get('id', None) == 'content':
            self.in_content = True
            self.content_level = self.level
        if self.in_content and tag == 'p':
            self.in_p = True
            self.p_level = self.level
        self.last_starttag.append({'tag': tag, 'attrs': attr_map})
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        print(' ' * self.level * 4 + '</' + tag + '>')
        if self.in_content and self.level == self.content_level:
            self.in_content = False
        # Only the first completed <p> populates desc.
        if self.in_p and self.level == self.p_level and not self.desc:
            self.in_p = False
            self.desc = ' '.join(self.desc_buffer)
        self.last_starttag.pop()
        self.level -= 1

    def handle_data(self, data):
        collapsed = re.sub(r'\s+', ' ', data).strip()
        if self.in_content and self.in_p:
            self.desc_buffer.append(collapsed)
        shown = collapsed if len(collapsed) <= 50 else collapsed[:47] + '...'
        if shown:
            print(' ' * (self.level + 1) * 4 + shown)

    def invisible(self, attrs):
        # True-ish when the tag's inline style contains display:none.
        style = re.sub(r'\s+', '', attrs.get('style', ''))
        return re.search(r'(^|;)display:none($|;)', style)

    def print_opening_tag(self, tag, attrs):
        """Print '<tag id="..." class="...">' at the current depth."""
        attrs = dict(attrs)
        parts = ['<', tag]
        if attrs.get('id'):
            parts.append(' id="%s"' % attrs['id'])
        if attrs.get('class'):
            parts.append(' class="%s"' % attrs['class'])
        print(' ' * self.level * 4 + ''.join(parts) + '>')
0 notes
awesomearound · 4 years ago
Text
Want suggestions for next post
Tumblr media
Hi there guys. I am Yasoob 🙂  As you know, writing quality tutorials takes plenty of time, so I was thinking that I should take your suggestions about the next article. These days I have been doing quite a lot of projects. Let me share the details of two of these projects. This project was not an ordinary project. As most of you know, Google Maps provides an API to access it programmatically, but there was a problem with it. The API was not providing complete route information for Japan. And in this project I particularly wanted information regarding Japan. In order to complete this project I had to rely on the BeautifulSoup, HTMLParser and requests libraries. It turned out to be more difficult than I had initially thought. I had to use an old Google Maps URL. I made a custom API for Google Maps by scraping data in real time. Posting to a Facebook group without the Graph API This was a kind of personal project. I wanted to test how difficult it is to scrape Facebook. The project was to build a bot. There are two ways to achieve this target without the API. The first one is to use a full-fledged WebKit engine, but I wanted to do this the difficult way. I resolved on the second method, which involved parsing the HTML page and submitting forms without evaluating the JavaScript. This method allowed me to sharpen up my web scraping and research skills. This method takes time to implement, but after implementation it is quite fast compared to the WebKit solution. Read the full article
0 notes
matiascreimerman · 8 years ago
Text
Crear PDF a partir de HTML  Con ItextSharp – Convert HTML to PDF (with ItextSharp)
2013-04-15 · 13:10
Crear PDF a partir de HTML  Con ItextSharp – Convert HTML to PDF (with ItextSharp)
/// <summary>
/// Renders an HTML string to a PDF file using iTextSharp's HTMLWorker.
/// </summary>
/// <param name="html">HTML markup to convert.</param>
/// <param name="fullDestinyFilePath">Full path of the PDF file to create.</param>
public void HTMLToPDF(string html, string fullDestinyFilePath)
{
    // Read the HTML directly; the original round-tripped it through a
    // StringWriter for no benefit.  using-blocks guarantee the reader and
    // file stream are disposed even if parsing throws.
    using (StringReader sr = new StringReader(html))
    using (FileStream fs = new FileStream(fullDestinyFilePath, FileMode.Create))
    {
        Document pdfDoc = new Document();
        HTMLWorker htmlparser = new HTMLWorker(pdfDoc);
        PdfWriter.GetInstance(pdfDoc, fs);
        pdfDoc.Open();
        htmlparser.Parse(sr);
        // Closing the Document flushes and finalizes the PDF output.
        pdfDoc.Close();
    }
}
Autor: Matías Creimerman
Matías Creimerman – Consultor IT – IT Consultant
Linkedin: http://ar.linkedin.com/in/matiascreimerman
Microsoft ASP.Net Member: http://forums.asp.net/members/matyvegan.aspx
Microsoft Virtual Academy Profile: https://www.microsoftvirtualacademy.com/Profile.aspx?alias=824999
About Me: https://about.me/matiascreimerman
GitHub Repository: https://github.com/mcrei/
Matias Creimerman
0 notes
andrew-chan1181 · 8 years ago
Text
安裝 jupyter (Python 執行程式_利用瀏覽器執行)
1.環境建置
brew install python3
      安裝Python3
pip3 install jupyter
      用pip安裝jupyter套件
jupyter notebook
2.爬蟲用套件
pip3 install requests
pip3 install beautifulsoup4
       Python3用beautifulsoup4抓網頁
sudo pip3 install HTMLParser
      Python3用,抓網頁
 pip3 install urlopen
pip3 install parse
pip3 install urlparse
pip3 install urljoin
install urljoin
sudo pip3 install urlparse
sudo pip3 install urllib
pip3 install urllib3
3. 結巴套件
pip3 install jieba
      Python3用jieba
jupyter notebook
0 notes
hinatabokori · 8 years ago
Link
0 notes
sergioneddi · 10 years ago
Link
PHP Simple HTML DOM Parser CSS Selector
4 notes · View notes
phdhwang · 12 years ago
Text
python HTMLParser 연습
#!/usr/bin/python import urllib2 import HTMLParser class MyParser(HTMLParser.HTMLParser): # # ==> attr : ('property', 'og:type') # ==> attr: ('content', 'xxxxx-feed:photo') # # ==> attr: ('property', 'og:image') # ==> attr: ('content', 'http://media.com/8a7ef6a/mf8ylpaOK51rx0ocqo1_500.gif') def __init__(self): HTMLParser.HTMLParser.__init__(self) self.found_type = False self.found_photo = False self.found_image = False self.image = '' def handle_starttag(self, tag, attrs): if tag != 'meta' : return for attr in attrs: #print " attr:", attr if self.found_type==False: if attr == ('property','og:type'): #print " attr:", attr self.found_type = True else: if self.found_photo==False and attr == ('content','xxxxx-feed:photo'): self.found_type = True elif attr == ('property','og:image'): self.found_image = True elif attr[0] == 'content' and self.found_image: print " attr:", attr self.image = attr[1]; else: self.found_image = False r = urllib2.urlopen('http://YOUR.xxxxx.com/random') d = r.read().decode('utf-8'); p = MyParser() #d = ' ' p.feed(d) print p.image
1 note · View note
maximumdx · 10 years ago
Text
XML Processing with Python: Part Four
XML Processing with Python: Part Four
XML is similar in structure and form to HTML. This is not entirely an accidental thing. XML and HTML both originated from SGML and share a number of syntactic features. The earlier versions of HTML are not directly compatible with XML, though, because XML requires that every tag be closed, and certain HTML tags don’t require a closing tag (such as <br> and <img>; the examples were stripped by the original page's HTML rendering). However, the W3C has declared the XHTML…
View On WordPress
0 notes