Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Cannot get h4 tag text
#1
import requests
from bs4 import BeautifulSoup

# Download the contact page and parse the raw response bytes.
response = requests.get("http://www.aniyanetworks.net/contact/")
soup = BeautifulSoup(response.content, "html.parser")

# Every <div> carrying the "vc_cta3-content" class. The result is deliberately
# not named ``all`` so the builtin all() stays usable in this script.
cta_divs = soup.find_all("div", {"class": "vc_cta3-content"})

# find_all() yields the matching tag objects, so printing the result shows
# the <h4> markup itself (tags included) rather than only the inner text.
h4_tags = cta_divs[0].find_all("h4")
print(h4_tags)
Output:
[<h4>Call us Now: 416-970-8844</h4>]
How can I get only the text, without the <h4> tags?
Reply
#2
When in doubt, use dir() or help() to ask the object how it thinks it should be used. In this case, there's a .text attribute which seems like a likely candidate for getting the text.

>>> import requests
>>> from bs4 import BeautifulSoup as bs
>>> site = requests.get("http://www.aniyanetworks.net/contact")
>>> soup = bs(site.content, "html.parser")
>>> divs = soup.find_all("div", {"class": "vc_cta3-content"})
>>> headers = [elem.find_all("h4") for elem in divs]
>>> headers
[[<h4>Call us Now: 416-970-8844</h4>], [<h4>8975 McLaughlin road south , Unit #6, Brampton ,ON, L6Y0Z6, Canada</h4>]]
>>> dir(headers[0][0])
['HTML_FORMATTERS', 'XML_FORMATTERS', '__bool__', '__call__', '__class__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_all_strings', '_attr_value_as_string', '_attribute_checker', '_find_all', '_find_one', '_formatter_for_name', '_is_xml', '_lastRecursiveChild', '_last_descendant', '_select_debug', '_selector_combinators', '_should_pretty_print', '_tag_name_matches_and', 'append', 'attribselect_re', 'attrs', 'can_be_empty_element', 'childGenerator', 'children', 'clear', 'contents', 'decode', 'decode_contents', 'decompose', 'descendants', 'encode', 'encode_contents', 'extract', 'fetchNextSiblings', 'fetchParents', 'fetchPrevious', 'fetchPreviousSiblings', 'find', 'findAll', 'findAllNext', 'findAllPrevious', 'findChild', 'findChildren', 'findNext', 'findNextSibling', 'findNextSiblings', 'findParent', 'findParents', 'findPrevious', 'findPreviousSibling', 'findPreviousSiblings', 'find_all', 'find_all_next', 'find_all_previous', 'find_next', 'find_next_sibling', 'find_next_siblings', 'find_parent', 'find_parents', 'find_previous', 'find_previous_sibling', 'find_previous_siblings', 'format_string', 'get', 'getText', 'get_attribute_list', 'get_text', 'has_attr', 'has_key', 'hidden', 'index', 'insert', 'insert_after', 'insert_before', 'isSelfClosing', 'is_empty_element', 'known_xml', 'name', 'namespace', 'next', 'nextGenerator', 'nextSibling', 'nextSiblingGenerator', 'next_element', 'next_elements', 'next_sibling', 'next_siblings', 'parent', 'parentGenerator', 'parents', 'parserClass', 'parser_class', 'prefix', 'preserve_whitespace_tags', 'prettify', 'previous', 
'previousGenerator', 'previousSibling', 'previousSiblingGenerator', 'previous_element', 'previous_elements', 'previous_sibling', 'previous_siblings', 'quoted_colon', 'recursiveChildGenerator', 'renderContents', 'replaceWith', 'replaceWithChildren', 'replace_with', 'replace_with_children', 'select', 'select_one', 'setup', 'string', 'strings', 'stripped_strings', 'tag_name_re', 'text', 'unwrap', 'wrap']
>>> headers[0][0].text
'Call us Now: 416-970-8844'
Reply
#3
(May-20-2019, 09:58 PM)nilamo Wrote: When in doubt, use dir() or help() to ask the object how it thinks it should be used. In this case, there's a .text attribute which seems like a likely candidate for getting the text.

>>> import requests
>>> from bs4 import BeautifulSoup as bs
>>> site = requests.get("http://www.aniyanetworks.net/contact")
>>> soup = bs(site.content, "html.parser")
>>> divs = soup.find_all("div", {"class": "vc_cta3-content"})
>>> headers = [elem.find_all("h4") for elem in divs]
>>> headers
[[<h4>Call us Now: 416-970-8844</h4>], [<h4>8975 McLaughlin road south , Unit #6, Brampton ,ON, L6Y0Z6, Canada</h4>]]
>>> dir(headers[0][0])
['HTML_FORMATTERS', 'XML_FORMATTERS', '__bool__', '__call__', '__class__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_all_strings', '_attr_value_as_string', '_attribute_checker', '_find_all', '_find_one', '_formatter_for_name', '_is_xml', '_lastRecursiveChild', '_last_descendant', '_select_debug', '_selector_combinators', '_should_pretty_print', '_tag_name_matches_and', 'append', 'attribselect_re', 'attrs', 'can_be_empty_element', 'childGenerator', 'children', 'clear', 'contents', 'decode', 'decode_contents', 'decompose', 'descendants', 'encode', 'encode_contents', 'extract', 'fetchNextSiblings', 'fetchParents', 'fetchPrevious', 'fetchPreviousSiblings', 'find', 'findAll', 'findAllNext', 'findAllPrevious', 'findChild', 'findChildren', 'findNext', 'findNextSibling', 'findNextSiblings', 'findParent', 'findParents', 'findPrevious', 'findPreviousSibling', 'findPreviousSiblings', 'find_all', 'find_all_next', 'find_all_previous', 'find_next', 'find_next_sibling', 'find_next_siblings', 'find_parent', 'find_parents', 'find_previous', 'find_previous_sibling', 'find_previous_siblings', 'format_string', 'get', 'getText', 'get_attribute_list', 'get_text', 'has_attr', 'has_key', 'hidden', 'index', 'insert', 'insert_after', 'insert_before', 'isSelfClosing', 'is_empty_element', 'known_xml', 'name', 'namespace', 'next', 'nextGenerator', 'nextSibling', 'nextSiblingGenerator', 'next_element', 'next_elements', 'next_sibling', 'next_siblings', 'parent', 'parentGenerator', 'parents', 'parserClass', 'parser_class', 'prefix', 'preserve_whitespace_tags', 'prettify', 'previous', 
'previousGenerator', 'previousSibling', 'previousSiblingGenerator', 'previous_element', 'previous_elements', 'previous_sibling', 'previous_siblings', 'quoted_colon', 'recursiveChildGenerator', 'renderContents', 'replaceWith', 'replaceWithChildren', 'replace_with', 'replace_with_children', 'select', 'select_one', 'setup', 'string', 'strings', 'stripped_strings', 'tag_name_re', 'text', 'unwrap', 'wrap']
>>> headers[0][0].text
'Call us Now: 416-970-8844'


Thank you ,
Would you mind explaining a little what you did with "headers = [elem.find_all("h4") for elem in divs]"?
Please, I am confused here.
Thank you
Reply
#4
import requests
from bs4 import BeautifulSoup

# Fetch the contact page and hand the response body to the HTML parser.
response = requests.get("http://www.aniyanetworks.net/contact/")
soup = BeautifulSoup(response.content, "html.parser")

# ``class_`` is Beautiful Soup's keyword argument for matching on the CSS
# class attribute (``class`` itself is a reserved word in Python).
vc_tag = soup.find_all("div", class_="vc_cta3-content")

# From the first matching <div>, take its first <h4> and print only the text
# inside the tag.
first_h4 = vc_tag[0].find("h4")
print(first_h4.text)
Output:
Call us Now: 416-970-8844
aniyanetworks Wrote:Do you mind to lil bit explain to me what you did with "headers = [elem.find_all("h4") for elem in divs]"
He uses a list comprehension that will get both h4 tags.
You can also just use find() in that list comprehension.
>>> [elem.find("h4") for elem in vc_tag]
[<h4>Call us Now: 416-970-8844</h4>,
 <h4>8975 McLaughlin road south , Unit #6, Brampton ,ON, L6Y0Z6, Canada</h4>]
>>> 
>>> e = [elem.find("h4").text for elem in vc_tag]
>>> e
['Call us Now: 416-970-8844',
 '8975 McLaughlin road south , Unit #6, Brampton ,ON, L6Y0Z6, Canada']

>>> for text in e:
...     print(text)
...     
Call us Now: 416-970-8844
8975 McLaughlin road south , Unit #6, Brampton ,ON, L6Y0Z6, Canada
Reply


Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020