#htmlparser
Explore tagged Tumblr posts
x-yuri · 8 years ago
Text
python: parse html
NOTE It doesn't handle unclosed tags well, e.g. an unclosed p tag. For instance,
<p><div></div>
gets printed with the <div> indented as a child of the <p>,
instead of as its sibling.
(The original post showed the difference through indentation,
which was lost in this copy.)
from html.parser import HTMLParser
import re
import urllib.request


def rm_extra_spaces(s):
    """Collapse runs of whitespace in *s* to single spaces and strip the ends."""
    return re.sub(r'\s+', ' ', s).strip()


def rm_spaces(s):
    """Remove all whitespace from *s*."""
    return re.sub(r'\s+', '', s)


def ellipsize(s, max_len):
    """Truncate *s* to at most *max_len* characters, ending in '...' if cut."""
    return s[:max_len - 3] + '...' if len(s) > max_len else s


class CategoryPageParse(HTMLParser):
    """Pretty-print an HTML document as an indented tag tree.

    Each start tag, end tag and (whitespace-collapsed, ellipsized) text
    node is printed on its own line, indented four spaces per level.
    """

    def __init__(self, debug=False):
        super().__init__()
        # Fix: the original accepted `debug` but never stored it.  It is
        # kept (and now stored) for interface compatibility with the
        # extracting variant of this parser; this printing-only version
        # always emits output.
        self.debug = debug
        self.level = -1  # depth of the current tag; -1 = before the root

    def handle_starttag(self, tag, attrs):
        self.level += 1
        # print_opening_tag converts attrs to a dict itself; the original
        # redundantly converted here as well.
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        self.print_at_level('</%s>' % tag, self.level)
        self.level -= 1

    def handle_data(self, data):
        # Collapse whitespace and shorten long text so the tree stays readable.
        data_processed = ellipsize(rm_extra_spaces(data), 50)
        if data_processed:
            self.print_at_level(data_processed, self.level + 1)

    def print_opening_tag(self, tag, attrs):
        """Print '<tag id="..." class="...">', keeping only id/class attrs."""
        attrs = dict(attrs)
        els = [tag]
        if 'id' in attrs:
            els.append('id="%s"' % attrs['id'])
        if 'class' in attrs:
            els.append('class="%s"' % attrs['class'])
        self.print_at_level('<%s>' % ' '.join(els), self.level)

    def print_at_level(self, s, level):
        """Print *s* indented four spaces per *level*."""
        print('%s%s' % (' ' * level * 4, s))
or to extract some data:
from html.parser import HTMLParser
import re
import urllib.request


def rm_extra_spaces(s):
    # Collapse runs of whitespace to single spaces and strip the ends.
    return re.sub(r'\s+', ' ', s).strip()


def rm_spaces(s):
    # Remove all whitespace from s.
    return re.sub(r'\s+', '', s)


def ellipsize(s, max_len):
    # Truncate s to at most max_len characters, ending in '...' if cut.
    return s[:max_len - 3] + '...' if len(s) > max_len else s


class CategoryPageParse(HTMLParser):
    """Extract into self.desc the text of the first visible <p> inside the
    element with id="content"; optionally pretty-print the tree when
    debug=True."""

    def __init__(self, debug=False):
        super().__init__()
        self.debug = debug            # when True, print the indented tag tree
        self.in_content = False       # currently inside the id="content" element?
        self.in_p = False             # currently inside the targeted <p>?
        self.desc = None              # extracted description, once complete
        self.desc_buffer = []         # text chunks collected while in_p
        self.level = -1               # depth of the current tag; -1 = before root
        self.last_starttag = []       # stack of open tags: {'tag', 'attrs'}

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        self.level += 1
        # Entering the content container: remember its depth so the matching
        # end tag can be recognized.
        if attrs.get('id', None) == 'content':
            self.in_content = True
            self.content_level = self.level
        # Start collecting only for the first visible <p> (desc still unset).
        if not self.desc and self.in_content and self.tag_is_visible_p(tag, attrs):
            self.in_p = True
            self.p_level = self.level
        self.last_starttag.append({'tag': tag, 'attrs': attrs})
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        if self.debug:
            self.print_at_level('</%s>' % tag, self.level)
        self.last_starttag.pop()
        # Closing the collected <p>: freeze the buffered text into desc.
        if self.in_p and self.level == self.p_level:
            self.in_p = False
            self.desc = ' '.join(self.desc_buffer)
        if self.in_content and self.level == self.content_level:
            self.in_content = False
        self.level -= 1

    def handle_data(self, data):
        if self.in_p:
            self.desc_buffer.append(rm_extra_spaces(data))
        if self.debug:
            data_processed = ellipsize(rm_extra_spaces(data), 50)
            if data_processed:
                self.print_at_level(data_processed, self.level + 1)

    def tag_is_visible_p(self, tag, attrs):
        # A <p> not hidden via inline display:none.
        return tag == 'p' and not self.tag_is_invisible(tag, attrs)

    def tag_is_invisible(self, tag, attrs):
        # Match display:none in the style attribute, whitespace-insensitively.
        attrs = dict(attrs)
        style_attr = attrs.get('style', '')
        return re.search(r'(^|;)display:none($|;)', rm_spaces(style_attr))

    def print_opening_tag(self, tag, attrs):
        # Debug-only: print '<tag id="..." class="...">' at the current depth.
        if not self.debug:
            return
        attrs = dict(attrs)
        els = [tag]
        if 'id' in attrs:
            els.append('id="%s"' % attrs['id'])
        if 'class' in attrs:
            els.append('class="%s"' % attrs['class'])
        self.print_at_level('<%s>' % ' '.join(els), self.level)

    def print_at_level(self, s, level):
        # Print s indented four spaces per level.
        print('%s%s'
              % (' ' * level * 4, s))
previous version
#!/usr/bin/env python
"""Fetch a page and pretty-print its HTML as an indented tag tree."""
import re
import urllib.request
from html.parser import HTMLParser


class CategoryPageParse(HTMLParser):
    """Print every start tag, end tag and text node, indented four
    spaces per nesting level."""

    def __init__(self):
        self.level = -1  # depth of the current tag; -1 = before the root
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.level += 1
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        print('%s%s' % (' ' * self.level * 4, '</' + tag + '>'))
        self.level -= 1

    def handle_data(self, data):
        # Collapse whitespace; ellipsize text longer than 50 characters.
        data_processed = re.sub(r'\s+', ' ', data).strip()
        if len(data_processed) > 50:
            data_processed = data_processed[:47] + '...'
        if data_processed:
            print('%s%s' % (' ' * (self.level + 1) * 4, data_processed))

    def print_opening_tag(self, tag, attrs):
        """Print '<tag id="..." class="...">', keeping only id/class attrs."""
        attrs = dict(attrs)
        # Renamed from `id`, which shadowed the builtin.
        id_attr = ' id="%s"' % attrs['id'] if attrs.get('id') else ''
        klass = ' class="%s"' % attrs['class'] if attrs.get('class') else ''
        print('%s<%s%s%s>' % (' ' * self.level * 4, tag, id_attr, klass))


def charset_from_content_type(content_type, default='utf-8'):
    """Return the charset parameter of a Content-Type header value.

    Falls back to *default* when no charset parameter is present.  The
    original inline code raised StopIteration when the header had
    parameters but none of them was a charset.
    """
    for param in re.split(r'\s*;\s*', content_type)[1:]:
        match = re.match(r'\s*charset\s*=\s*(\S+)', param)
        if match:
            return match.group(1)
    return default


def main():
    resp = urllib.request.urlopen('http://example.com')
    charset = charset_from_content_type(resp.getheader('Content-Type'))
    parser = CategoryPageParse()
    parser.feed(resp.read().decode(charset))


# Guarded so importing this module no longer performs network I/O.
if __name__ == '__main__':
    main()
or to extract some data:
class CategoryPageParse(HTMLParser):
    """Pretty-print an HTML tree while capturing into self.desc the text
    of the first <p> found inside the element with id="content"."""

    def __init__(self):
        self.in_content = False    # inside the id="content" element?
        self.in_p = False          # inside a <p> within that element?
        self.desc = None           # extracted description, once complete
        self.desc_buffer = []      # text chunks collected while in_p
        self.level = -1            # current nesting depth
        self.last_starttag = []    # stack of open tags ({'tag', 'attrs'})
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.level += 1
        attr_map = dict(attrs)
        if attr_map.get('id', None) == 'content':
            self.in_content = True
            self.content_level = self.level
        if self.in_content and tag == 'p':
            self.in_p = True
            self.p_level = self.level
        self.last_starttag.append({'tag': tag, 'attrs': attr_map})
        self.print_opening_tag(tag, attrs)

    def handle_endtag(self, tag):
        print(' ' * self.level * 4 + '</' + tag + '>')
        if self.in_content and self.level == self.content_level:
            self.in_content = False
        # Only the first completed <p> populates desc.
        if self.in_p and self.level == self.p_level and not self.desc:
            self.in_p = False
            self.desc = ' '.join(self.desc_buffer)
        self.last_starttag.pop()
        self.level -= 1

    def handle_data(self, data):
        collapsed = re.sub(r'\s+', ' ', data).strip()
        if self.in_content and self.in_p:
            self.desc_buffer.append(collapsed)
        shown = collapsed if len(collapsed) <= 50 else collapsed[:47] + '...'
        if shown:
            print(' ' * (self.level + 1) * 4 + shown)

    def invisible(self, attrs):
        # True-ish when the tag's inline style contains display:none.
        style = re.sub(r'\s+', '', attrs.get('style', ''))
        return re.search(r'(^|;)display:none($|;)', style)

    def print_opening_tag(self, tag, attrs):
        """Print '<tag id="..." class="...">' at the current depth."""
        attrs = dict(attrs)
        parts = ['<', tag]
        if attrs.get('id'):
            parts.append(' id="%s"' % attrs['id'])
        if attrs.get('class'):
            parts.append(' class="%s"' % attrs['class'])
        print(' ' * self.level * 4 + ''.join(parts) + '>')
0 notes
awesomearound · 4 years ago
Text
Want suggestions for next post
Tumblr media
Hi there guys. I am Yasoob 🙂  As you know, writing quality tutorials takes plenty of time, so I was thinking that I should take your suggestions about the next article. These days I have been doing quite a lot of projects. Let me share the details of two of these projects. This project was not an ordinary project. As most of you know, Google Maps provides an API to access it programmatically, but there was a problem with it. The API was not providing complete route information for Japan. And in this project I particularly wanted information regarding Japan. In order to complete this project I had to rely on the BeautifulSoup, HTMLParser and requests libraries. It turned out to be more difficult than I had initially thought. I had to use an old Google Maps URL. I made a custom API for Google Maps by scraping data in real time. Posting to a Facebook group without the Graph API This was a kind of personal project. I wanted to test how difficult it is to scrape Facebook. The project was to build a bot. There are two ways to achieve this target without the API. The first one is to use a full-fledged WebKit engine, but I wanted to do this the difficult way. I resolved on the second method, which involved parsing the HTML page and submitting forms without evaluating the JavaScript. This method allowed me to sharpen up my web scraping and research skills. This method takes time to implement, but after implementation it is quite fast compared to the WebKit solution. Read the full article
0 notes
matiascreimerman · 8 years ago
Text
Crear PDF a partir de HTML  Con ItextSharp – Convert HTML to PDF (with ItextSharp)
2013-04-15 · 13:10
Crear PDF a partir de HTML  Con ItextSharp – Convert HTML to PDF (with ItextSharp)
/// <summary>
/// Renders an HTML string to a PDF file using iTextSharp's HTMLWorker.
/// </summary>
/// <param name="html">HTML markup to convert.</param>
/// <param name="fullDestinyFilePath">Full path of the PDF file to create.</param>
public void HTMLToPDF(string html, string fullDestinyFilePath)
{
    // Read the HTML directly; the original round-tripped it through a
    // StringWriter for no benefit.  using-blocks guarantee the reader and
    // file stream are disposed even if parsing throws.
    using (StringReader sr = new StringReader(html))
    using (FileStream fs = new FileStream(fullDestinyFilePath, FileMode.Create))
    {
        Document pdfDoc = new Document();
        HTMLWorker htmlparser = new HTMLWorker(pdfDoc);
        PdfWriter.GetInstance(pdfDoc, fs);
        pdfDoc.Open();
        htmlparser.Parse(sr);
        // Closing the Document flushes and finalizes the PDF output.
        pdfDoc.Close();
    }
}
Autor: Matías Creimerman
Matías Creimerman – Consultor IT – IT Consultant
Linkedin: http://ar.linkedin.com/in/matiascreimerman
Microsoft ASP.Net Member: http://forums.asp.net/members/matyvegan.aspx
Microsoft Virtual Academy Profile: https://www.microsoftvirtualacademy.com/Profile.aspx?alias=824999
About Me: https://about.me/matiascreimerman
GitHub Repository: https://github.com/mcrei/
Matias Creimerman
0 notes
andrew-chan1181 · 8 years ago
Text
安裝 jupyter (Python 執行程式_利用瀏覽器執行)
1.環境建置
brew install python3
      安裝Python3
pip3 install jupyter
      用pip安裝jupyter套件
jupyter notebook
2.爬蟲用套件
pip3 install requests
pip3 install beautifulsoup4
       Python3用beautifulsoup4抓網頁
sudo pip3 install HTMLParser
      Python3用,抓網頁
 pip3 install urlopen
pip3 install parse
pip3 install urlparse
pip3 install urljoin
install urljoin
sudo pip3 install urlparse
sudo pip3 install urllib
pip3 install urllib3
3. 結巴套件
pip3 install jieba
      Python3用jieba
jupyter notebook
0 notes
hinatabokori · 8 years ago
Link
0 notes
sergioneddi · 10 years ago
Link
PHP Simple HTML DOM Parser CSS Selector
4 notes · View notes
phdhwang · 12 years ago
Text
python HTMLParser 연습
#!/usr/bin/python import urllib2 import HTMLParser class MyParser(HTMLParser.HTMLParser): # # ==> attr : ('property', 'og:type') # ==> attr: ('content', 'xxxxx-feed:photo') # # ==> attr: ('property', 'og:image') # ==> attr: ('content', 'http://media.com/8a7ef6a/mf8ylpaOK51rx0ocqo1_500.gif') def __init__(self): HTMLParser.HTMLParser.__init__(self) self.found_type = False self.found_photo = False self.found_image = False self.image = '' def handle_starttag(self, tag, attrs): if tag != 'meta' : return for attr in attrs: #print " attr:", attr if self.found_type==False: if attr == ('property','og:type'): #print " attr:", attr self.found_type = True else: if self.found_photo==False and attr == ('content','xxxxx-feed:photo'): self.found_type = True elif attr == ('property','og:image'): self.found_image = True elif attr[0] == 'content' and self.found_image: print " attr:", attr self.image = attr[1]; else: self.found_image = False r = urllib2.urlopen('http://YOUR.xxxxx.com/random') d = r.read().decode('utf-8'); p = MyParser() #d = ' ' p.feed(d) print p.image
1 note · View note
maximumdx · 10 years ago
Text
XML Processing with Python: Part Four
XML Processing with Python: Part Four
XML is similar in structure and form to HTML. This is not entirely an accidental thing. XML and HTML both originated from SGML and share a number of syntactic features. The earlier versions of HTML are not directly compatible with XML, though, because XML requires that every tag be closed, and certain HTML tags don’t require a closing tag (such as <br> and <img>; the examples were stripped by the original page's HTML rendering). However, the W3C has declared the XHTML…
View On WordPress
0 notes