(download.file() in R, wget in Python/Terminal)
(rvest in R, beautifulsoup in Python)
(RSelenium in R, selenium in Python)
(httr in R, requests in Python)
<!DOCTYPE html>
<html>
<head>
<title>A title</title>
</head>
<body>
<h1 style="color:Red;">A heading</h1>
<p>A paragraph.</p>
</body>
</html>
Extra: W3Schools: Try HTML
(<h1>)
(</h1>)
(style="color:Red;")
<html>, <body>, <header>
<h1>, <title>, <div>
<b>, <i>
<a>
# Attach rvest, which supplies the HTML helpers used in the examples below.
library(rvest)

# A minimal HTML page held in a string so it can be parsed directly.
html_txt <- "
<!DOCTYPE html>
<html>
<head>
<title>A title</title>
</head>
<body>
<h1 style='color:Red;'>A heading</h1>
<p>A paragraph.</p>
</body>
</html>"

# Parse the string and show the structure of the resulting object.
html <- rvest::read_html(x = html_txt)
str(object = html)
List of 2 $ node:<externalptr> $ doc :<externalptr> - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# Collect the direct children of the parsed document; the surrounding
# parentheses print the result as well as assigning it.
(children <- rvest::html_children(x = html))
{xml_nodeset (2)} [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ... [2] <body>\n <h1 style="color:Red;">A heading</h1> \n <p>A para ...
# Keep the second top-level node and report which tag it is.
body <- children[2]
print(rvest::html_name(x = body))
[1] "body"
# Go one level deeper: the child nodes of `body` (assigned and printed).
(children2 <- rvest::html_children(x = body))
{xml_nodeset (2)} [1] <h1 style="color:Red;">A heading</h1> [2] <p>A paragraph.</p>
# Show the attributes of the first child node of `body`.
print(rvest::html_attrs(x = children2[1]))
[[1]] style "color:Red;"
# Show the text content of the first child node of `body`.
print(rvest::html_text(x = children2[1]))
[1] "A heading"
<?xml version="1.0" encoding="UTF-8" ?>
<courses>
<course>
<title>Computer Programming for Social Scientists</title>
<code>POP77001</code>
<year>2021</year>
<term>Michaelmas</term>
<description>Course on computer programming in Python and R.</description>
</course>
<course>
<title>Applied Statistical Analysis I</title>
<code>POP77003</code>
<year>2021</year>
<term>Michaelmas</term>
<description>Introduction to statistical inference.</description>
</course>
</courses>
# Attach xml2, which supplies the XML helpers used in the examples below.
library(xml2)

# A small XML document describing two courses, held in a string.
xml_txt <-
'<?xml version="1.0" encoding="UTF-8" ?>
<courses>
<course>
<title>Computer Programming for Social Scientists</title>
<code>POP77001</code>
<year>2021</year>
<term>Michaelmas</term>
<description>Course on computer programming in Python and R.</description>
</course>
<course>
<title>Applied Statistical Analysis I</title>
<code>POP77003</code>
<year>2021</year>
<term>Michaelmas</term>
<description>Introduction to statistical inference.</description>
</course>
</courses>'

# Parse the string and show the structure of the resulting object.
xml <- xml2::read_xml(x = xml_txt)
str(object = xml)
List of 2 $ node:<externalptr> $ doc :<externalptr> - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# Collect the child nodes of the parsed XML document (assigned and printed).
(children3 <- xml2::xml_children(x = xml))
{xml_nodeset (2)} [1] <course>\n <title>Computer Programming for Social Scientists</title>\n ... [2] <course>\n <title>Applied Statistical Analysis I</title>\n <code>POP770 ...
# Keep the first <course> node and list the elements nested inside it.
pop77001 <- children3[1]
print(xml2::xml_children(x = pop77001))
{xml_nodeset (5)} [1] <title>Computer Programming for Social Scientists</title> [2] <code>POP77001</code> [3] <year>2021</year> [4] <term>Michaelmas</term> [5] <description>Course on computer programming in Python and R.</description>
# Extract the text of every element inside the first course. The call is
# namespace-qualified like the rest of the file, and it reuses `pop77001`
# (the first <course> node) instead of indexing `children3` again.
xml2::xml_text(xml2::xml_children(pop77001))
[1] "Computer Programming for Social Scientists" [2] "POP77001" [3] "2021" [4] "Michaelmas" [5] "Course on computer programming in Python and R."
(.docx, .xlsx, .pptx, OpenOffice/LibreOffice)
/ - select from the root node (e.g. /html/body)
// - select elements at any depth (e.g. //h1)
//<tag>/* - select all child elements of tag (e.g. //body/*)
//<tag>[@<attr>] - select all elements that have the given attribute (e.g. //h1[@style])
//<tag>[@<attr>='<value>'] - select all elements whose attribute has the given value (e.g. //h1[@style='color:Red;'])
Extra: XPath syntax
# Find every <p> element, at any depth, in the parsed HTML.
print(rvest::html_elements(x = html, xpath = "//p"))
{xml_nodeset (1)} [1] <p>A paragraph.</p>
# Find <h1> elements whose style attribute equals 'color:Red;'.
print(rvest::html_elements(x = html, xpath = "//h1[@style='color:Red;']"))
{xml_nodeset (1)} [1] <h1 style="color:Red;">A heading</h1>
# Find every <code> element, at any depth, in the parsed XML.
print(xml2::xml_find_all(x = xml, xpath = "//code"))
{xml_nodeset (2)} [1] <code>POP77001</code> [2] <code>POP77003</code>
# Elements can also be matched on their text content via text().
print(xml2::xml_find_all(x = xml, xpath = "//code[text()='POP77001']"))
{xml_nodeset (1)} [1] <code>POP77001</code>
# Fetch and parse the Wikipedia page (this reassigns `html`), then collect
# every <table> element; the parentheses print the nodeset after assignment.
html <- rvest::read_html(
  x = "https://en.wikipedia.org/wiki/Members_of_the_1st_D%C3%A1il"
)
(tables <- rvest::html_elements(x = html, xpath = "//table"))
{xml_nodeset (8)} [1] <table class="box-More_citations_needed plainlinks metadata ambox ambox-c ... [2] <table class="infobox vevent"><tbody>\n<tr><th colspan="2" class="infobox ... [3] <table style="width:100%; border-collapse:collapse"><tbody><tr style="ver ... [4] <table class="wikitable" style="font-size: 95%;"><tbody>\n<tr style="back ... [5] <table class="wikitable" style="margin: 1em 1em 1em 0; background: #f9f9f ... [6] <table class="wikitable"><tbody>\n<tr>\n<th>Constituency\n</th>\n<th>Outg ... [7] <table class="wikitable"><tbody>\n<tr>\n<th>Winner\n</th>\n<th colspan="2 ... [8] <table class="nowraplinks mw-collapsible autocollapse navbox-inner" style ...
# Take the child nodes of the fifth table found on the page (assigned and
# printed).
(tbody <- rvest::html_children(x = tables[5]))
{xml_nodeset (1)} [1] <tbody>\n<tr style="background-color:#E9E9E9;"><th colspan="4">Members of ...
# Convert the selected table nodes into tabular data (assigned and printed).
(tds <- rvest::html_table(x = tbody))
[[1]] # A tibble: 106 × 4 `Members of the 1s… `Members of the 1s… `Members of the 1… `Members of the 1… <chr> <chr> <chr> <chr> 1 Constituency Name "Party" Party 2 Antrim East Robert McCalmont "" Irish Unionist 3 Antrim Mid Hugh O'Neill "" Irish Unionist 4 Antrim North Peter Kerr-Smiley "" Irish Unionist 5 Antrim South Charles Curtis Cra… "" Irish Unionist 6 Armagh Mid James Rolston Lons… "" Irish Unionist 7 Armagh North William Allen "" Irish Unionist 8 Armagh South Patrick Donnelly "" Irish Parliamenta… 9 Belfast Cromac William Arthur Lin… "" Irish Unionist 10 Belfast Duncairn Edward Carson "" Irish Unionist # … with 96 more rows
# Inspect the structure of the html_table() result.
str(object = tds)
List of 1 $ : tibble [106 × 4] (S3: tbl_df/tbl/data.frame) ..$ Members of the 1st Dáil[4]: chr [1:106] "Constituency" "Antrim East" "Antrim Mid" "Antrim North" ... ..$ Members of the 1st Dáil[4]: chr [1:106] "Name" "Robert McCalmont" "Hugh O'Neill" "Peter Kerr-Smiley" ... ..$ Members of the 1st Dáil[4]: chr [1:106] "Party" "" "" "" ... ..$ Members of the 1st Dáil[4]: chr [1:106] "Party" "Irish Unionist" "Irish Unionist" "Irish Unionist" ...
# Pull the first element out of the list with [[ ]] and preview its top rows.
tds <- tds[[1]]
print(head(x = tds))
Members of the 1st Dáil[4] Members of the 1st Dáil[4] 1 Constituency Name 2 Antrim East Robert McCalmont 3 Antrim Mid Hugh O'Neill 4 Antrim North Peter Kerr-Smiley 5 Antrim South Charles Curtis Craig 6 Armagh Mid James Rolston Lonsdale Members of the 1st Dáil[4] Members of the 1st Dáil[4] 1 Party Party 2 Irish Unionist 3 Irish Unionist 4 Irish Unionist 5 Irish Unionist 6 Irish Unionist
# Promote the first row to column names. unlist() flattens the one-row
# selection into a plain vector, so `colnames<-` receives actual names rather
# than a one-row table that has to be coerced implicitly.
colnames(tds) <- unlist(tds[1, ], use.names = FALSE)
# The first row is now stored in the names, so drop it from the data.
tds <- tds[-1, ]
head(tds)
Constituency Name Party Party 1 Antrim East Robert McCalmont Irish Unionist 2 Antrim Mid Hugh O'Neill Irish Unionist 3 Antrim North Peter Kerr-Smiley Irish Unionist 4 Antrim South Charles Curtis Craig Irish Unionist 5 Armagh Mid James Rolston Lonsdale Irish Unionist 6 Armagh North William Allen Irish Unionist
# Remove the third column, then inspect the structure of what remains.
tds <- tds[, -3]
str(object = tds)
tibble [105 × 3] (S3: tbl_df/tbl/data.frame) $ Constituency: chr [1:105] "Antrim East" "Antrim Mid" "Antrim North" "Antrim South" ... $ Name : chr [1:105] "Robert McCalmont" "Hugh O'Neill" "Peter Kerr-Smiley" "Charles Curtis Craig" ... $ Party : chr [1:105] "Irish Unionist" "Irish Unionist" "Irish Unionist" "Irish Unionist" ...