Skip to the content.

Medium Test Analysis

Introduction

The medium test aims to replicate the analysis conducted in Section 41.1.3 using functions from the XML package. This test focuses on transforming XML data into a structured data frame for further analysis and reporting. The XML package, being an older package, offers a straightforward way to convert XML data into data frames, which is particularly useful for handling nested XML structures. The XML document provided in the test is a nested structure: the root <movies> element contains <movie> elements, each of which carries attributes and contains further nested elements. The goal is to extract specific information from this XML document, such as the text values, length, attributes, name, children, etc. of the elements nested under the root element.

Setting Up the Environment

Section 1: Loading Libraries and parsing XML Content

library(XML)

# Sample XML document stored as a character vector, one markup line per
# element. Each <movie> record is grouped in its own c() call purely for
# readability; nested c() calls flatten, so the result is a single flat
# character vector in document order.
xml_content <- c(
  '<?xml version="1.0" encoding="UTF-8"?>',
  '<movies>',
  # First record
  c(
    '<movie mins="126" lang="eng">',
    '<title>Good Will Hunting</title>',
    '<director>',
    '<first_name>Gus</first_name>',
    '<last_name>Van Sant</last_name>',
    '</director>',
    '<year>1998</year>',
    '<genre>drama</genre>',
    '</movie>'
  ),
  # Second record
  c(
    '<movie mins="106" lang="spa">',
    '<title>Y tu mama tambien</title>',
    '<director>',
    '<first_name>Alfonso</first_name>',
    '<last_name>Cuaron</last_name>',
    '</director>',
    '<year>2001</year>',
    '<genre>drama</genre>',
    '</movie>'
  ),
  '</movies>'
)

Explanation

# Collapse the lines into one string and parse it into an XML document.
# asText = TRUE states explicitly that the first argument is XML markup and
# not a file name or URL, so the call does not rely on xmlTreeParse()
# guessing this from the content. useInternalNodes = TRUE keeps the parsed
# tree as internal nodes, which is what the printed result reflects.
doc <- xmlTreeParse(
  paste(xml_content, collapse = ""),
  useInternalNodes = TRUE,
  asText = TRUE
)
doc
## <?xml version="1.0" encoding="UTF-8"?>
## <movies>
##   <movie mins="126" lang="eng">
##     <title>Good Will Hunting</title>
##     <director>
##       <first_name>Gus</first_name>
##       <last_name>Van Sant</last_name>
##     </director>
##     <year>1998</year>
##     <genre>drama</genre>
##   </movie>
##   <movie mins="106" lang="spa">
##     <title>Y tu mama tambien</title>
##     <director>
##       <first_name>Alfonso</first_name>
##       <last_name>Cuaron</last_name>
##     </director>
##     <year>2001</year>
##     <genre>drama</genre>
##   </movie>
## </movies>
## 

Explanation

Section 2: Navigation of XML Tree

2.1 Access the root Node

# Get the root node of the XML document (here the <movies> element).
# Printing it shows the same tree as `doc`, but without the
# <?xml ...?> declaration line.
movies <- xmlRoot(doc)
movies
## <movies>
##   <movie mins="126" lang="eng">
##     <title>Good Will Hunting</title>
##     <director>
##       <first_name>Gus</first_name>
##       <last_name>Van Sant</last_name>
##     </director>
##     <year>1998</year>
##     <genre>drama</genre>
##   </movie>
##   <movie mins="106" lang="spa">
##     <title>Y tu mama tambien</title>
##     <director>
##       <first_name>Alfonso</first_name>
##       <last_name>Cuaron</last_name>
##     </director>
##     <year>2001</year>
##     <genre>drama</genre>
##   </movie>
## </movies>
# Check if the XML document and the root node are identical.
# They are not: `doc` is the whole parsed document, whereas `movies` is
# only the root element extracted from it with xmlRoot().
identical(doc, movies)
## [1] FALSE

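It turns out that doc and movies are not actually identical: doc is the whole parsed document (it prints with the XML declaration line), whereas movies is only its root <movies> node.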

Explanation

2.2 Access the children of movies node

# Access the child nodes of the root node. The result is a node list with
# one entry per <movie> element; each entry is named after its tag, so both
# entries are called "movie".
xmlChildren(movies)
## $movie
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie> 
## 
## $movie
## <movie mins="106" lang="spa">
##   <title>Y tu mama tambien</title>
##   <director>
##     <first_name>Alfonso</first_name>
##     <last_name>Cuaron</last_name>
##   </director>
##   <year>2001</year>
##   <genre>drama</genre>
## </movie> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# Access the first movie node ("Good Will Hunting"). Both children of the
# root share the name "movie", so the node is selected by position.
good_will <- xmlChildren(movies)[[1]]
good_will
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie>
# Access the second movie node ("Y tu mama tambien"), again by position
# because both children of the root are named "movie".
tu_mama <- xmlChildren(movies)[[2]]
tu_mama
## <movie mins="106" lang="spa">
##   <title>Y tu mama tambien</title>
##   <director>
##     <first_name>Alfonso</first_name>
##     <last_name>Cuaron</last_name>
##   </director>
##   <year>2001</year>
##   <genre>drama</genre>
## </movie>

Explanation

Section 3: Inspecting first node

3.1 Inspecting contents of the children of movies node

# Access the children nodes of 'good_will': the <title>, <director>, <year>
# and <genre> elements, returned as a node list named by tag.
xmlChildren(good_will)
## $title
## <title>Good Will Hunting</title> 
## 
## $director
## <director>
##   <first_name>Gus</first_name>
##   <last_name>Van Sant</last_name>
## </director> 
## 
## $year
## <year>1998</year> 
## 
## $genre
## <genre>drama</genre> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# Access the children nodes of 'tu_mama'; it has the same four child
# elements (<title>, <director>, <year>, <genre>) as 'good_will'.
xmlChildren(tu_mama)
## $title
## <title>Y tu mama tambien</title> 
## 
## $director
## <director>
##   <first_name>Alfonso</first_name>
##   <last_name>Cuaron</last_name>
## </director> 
## 
## $year
## <year>2001</year> 
## 
## $genre
## <genre>drama</genre> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# Get the name (tag) of the 'good_will' node
xmlName(good_will)
## [1] "movie"
# Get the attributes of the 'good_will' node as a named character vector;
# the values are strings (e.g. "126"), not numbers
xmlAttrs(good_will)
##  mins  lang 
## "126" "eng"
# Get the size of the 'good_will' node, i.e. its number of direct children
# (title, director, year, genre); deeper nodes such as <first_name> are
# not counted
xmlSize(good_will)
## [1] 4

Explanation

3.2 Inspecting contents of good_will node

# Walk the direct children of 'good_will' by position and print the tag
# name of each one
children_nodes <- xmlChildren(good_will)
for (i in seq_along(children_nodes)) {
  print(xmlName(children_nodes[[i]]))
}
## [1] "title"
## [1] "director"
## [1] "year"
## [1] "genre"
# Access the title node of 'good_will' by name
title1 <- xmlChildren(good_will)[["title"]]
title1
## <title>Good Will Hunting</title>
# Access the children nodes of 'title1'; its only child is the text node
# (listed under the name "text") that holds the title string
xmlChildren(title1)
## $text
## Good Will Hunting 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# Get the text content of 'title1' as a character string
xmlValue(title1)
## [1] "Good Will Hunting"

Explanation

Section 4: Inspecting director node

# Access the director node of 'good_will' by name
dir1 <- xmlChildren(good_will)[["director"]]
dir1
## <director>
##   <first_name>Gus</first_name>
##   <last_name>Van Sant</last_name>
## </director>
# Access the children nodes of 'dir1': the <first_name> and <last_name>
# elements
xmlChildren(dir1)
## $first_name
## <first_name>Gus</first_name> 
## 
## $last_name
## <last_name>Van Sant</last_name> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# Get the text content of 'dir1'. The text of the nested elements is
# concatenated with no separator, which is why the result reads
# "GusVan Sant" rather than "Gus Van Sant".
xmlValue(dir1)
## [1] "GusVan Sant"

Explanation

The results obtained from the code can be compared with those for the example data set outlined in Section 41.1.3 of Computing with Data.