Example URL https://bioconductor.org/packages/release/bioc/VIEWS
Currently I’m splitting the metadata into one clump per package at every blank line, then converting each clump to a dictionary by splitting on the first colon, using the string before it as the key and the string after it as the value. The issue I’m running into is that, as I go line by line through each package’s metadata, some lines do not have a colon, and I want to append those lines to the previous value so that it forms one complete string.
Python
x
9
1
# Download the Bioconductor VIEWS index as text.
response = requests.get(
    'https://bioconductor.org/packages/release/bioc/VIEWS')

# Each package's metadata is separated from the next by a blank line,
# i.e. two consecutive newline characters.
package_list = response.text.split('\n\n')

# Naive first attempt: for every record, use the text before the first colon
# as the key and the text between the first and second colon as the value.
# Records without any colon (e.g. an empty trailing chunk) are skipped so the
# [1] index cannot fail.
# NOTE: this yields only one key per record and does not merge continuation
# lines (lines without a colon) into the previous value.
package_dict = {
    package.split(':')[0]: package.split(':')[1]
    for package in package_list
    if ':' in package
}
9
Advertisement
Answer
Try using regex to parse the data:
Python
1
17
17
1
import re

url = "https://bioconductor.org/packages/release/bioc/VIEWS"

# Group 1 is a field name: it starts at the beginning of a line with a
# non-whitespace character and runs up to the first ": ".
# Group 2 is the field value: matched lazily and across newlines (re.S) until
# the next field name at the start of a line (re.M) or the end of the string.
# Lines that start with whitespace therefore stay inside the current value,
# and trailing whitespace is left out of the captured value.
pat = re.compile(
    r"^([^\s][^:]*): (.+?)\s*(?=^[^\s][^:]*:|\Z)", flags=re.S | re.M
)


def parse_records(text):
    """Parse blank-line-separated ``Key: value`` records into dictionaries.

    Args:
        text: The full text to parse; records are separated by a blank line.

    Returns:
        A list with one dict per non-blank record, mapping each field name to
        its value. Values that span several lines are kept as one string,
        including the embedded newline and indentation.
    """
    return [
        dict(pat.findall(chunk))
        for chunk in text.split("\n\n")
        if chunk.strip()  # skip empty or whitespace-only chunks
    ]


if __name__ == "__main__":
    # Imported here so that parse_records() can be used without requests
    # being installed.
    import requests

    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    data = response.text

    out = parse_records(data)

    print(out)
17
Prints:
JavaScript
1
65
65
1
[
2
{
3
"Package": "a4",
4
"Version": "1.44.0",
5
"Depends": "a4Base, a4Preproc, a4Classif, a4Core, a4Reporting",
6
"Suggests": "MLP, nlcv, ALL, Cairo, Rgraphviz, GOstats",
7
"License": "GPL-3",
8
"MD5sum": "cc696d3373a9f258d293f2d966da11d5",
9
"NeedsCompilation": "no",
10
"Title": "Automated Affymetrix Array Analysis Umbrella Package",
11
"Description": "Umbrella package is available for the entire Automatedn Affymetrix Array Analysis suite of package.",
12
"biocViews": "Microarray",
13
"Author": "Willem Talloen [aut], Tobias Verbeke [aut], Laure Cougnaud\n [cre]",
14
"Maintainer": "Laure Cougnaud <laure.cougnaud@openanalytics.eu>",
15
"git_url": "https://git.bioconductor.org/packages/a4",
16
"git_branch": "RELEASE_3_15",
17
"git_last_commit": "5b0fc5a",
18
"git_last_commit_date": "2022-04-26",
19
"Date/Publication": "2022-04-26",
20
"source.ver": "src/contrib/a4_1.44.0.tar.gz",
21
"win.binary.ver": "bin/windows/contrib/4.2/a4_1.44.0.zip",
22
"mac.binary.ver": "bin/macosx/contrib/4.2/a4_1.44.0.tgz",
23
"vignettes": "vignettes/a4/inst/doc/a4vignette.pdf",
24
"vignetteTitles": "a4vignette",
25
"hasREADME": "FALSE",
26
"hasNEWS": "TRUE",
27
"hasINSTALL": "FALSE",
28
"hasLICENSE": "FALSE",
29
"Rfiles": "vignettes/a4/inst/doc/a4vignette.R",
30
"dependencyCount": "82"
31
},
32
{
33
"Package": "a4Base",
34
"Version": "1.44.0",
35
"Depends": "a4Preproc, a4Core",
36
"Imports": "methods, graphics, grid, Biobase, annaffy, mpm, genefilter,\n limma, multtest, glmnet, gplots",
37
"Suggests": "Cairo, ALL, hgu95av2.db, nlcv",
38
"Enhances": "gridSVG, JavaGD",
39
"License": "GPL-3",
40
"MD5sum": "094c0a1c87b18ff8f16a3dbe4d06da64",
41
"NeedsCompilation": "no",
42
"Title": "Automated Affymetrix Array Analysis Base Package",
43
"Description": "Base utility functions are available for the Automated\n Affymetrix Array Analysis set of packages.",
44
"biocViews": "Microarray",
45
"Author": "Willem Talloen [aut], Tine Casneuf [aut], An De Bondt [aut],\n Steven Osselaer [aut], Hinrich Goehlmann [aut], Willem\n Ligtenberg [aut], Tobias Verbeke [aut], Laure Cougnaud [cre]",
46
"Maintainer": "Laure Cougnaud <laure.cougnaud@openanalytics.eu>",
47
"git_url": "https://git.bioconductor.org/packages/a4Base",
48
"git_branch": "RELEASE_3_15",
49
"git_last_commit": "9ae69e0",
50
"git_last_commit_date": "2022-04-26",
51
"Date/Publication": "2022-04-26",
52
"source.ver": "src/contrib/a4Base_1.44.0.tar.gz",
53
"win.binary.ver": "bin/windows/contrib/4.2/a4Base_1.44.0.zip",
54
"mac.binary.ver": "bin/macosx/contrib/4.2/a4Base_1.44.0.tgz",
55
"hasREADME": "FALSE",
56
"hasNEWS": "TRUE",
57
"hasINSTALL": "FALSE",
58
"hasLICENSE": "FALSE",
59
"dependsOnMe": "a4",
60
"suggestsMe": "epimutacions",
61
"dependencyCount": "73"
62
},
63
64
and so on.
65