Example URL: https://bioconductor.org/packages/release/bioc/VIEWS
Currently I’m splitting the file into individual clumps of metadata at every blank line, then converting each clump to a dictionary by splitting on the first colon, using the string before it as the key and the string after it as the value. The issue I’m running into is that, as I go line by line through each package’s metadata, some lines do not have colons, and I want to append those lines to the previous value as one complete string.
response = requests.get(
    'https://bioconductor.org/packages/release/bioc/VIEWS')
# Package records are separated by a blank line, i.e. two consecutive newlines.
package_list = response.text.split('\n\n')
# Split each record (the loop variable, not the whole list) on ':'.
# NOTE: this keeps only the text before the first colon as the key and the
# text between the first and second colon as the value; later fields and
# continuation lines (lines without a colon) are not handled here.
package_dict = {
    package.split(':')[0]: package.split(':')[1]
    for package in package_list
    if ':' in package  # skip empty/trailing chunks that have no field at all
}
Advertisement
Answer
Try using regex to parse the data:
import re

# One "Key: value" field of a record.
#   ^([^\s][^:]*)   key: starts at the beginning of a line with a
#                   non-whitespace character and runs up to the first colon
#   ": "            separator
#   (.+?)           value: lazy, and may span several lines (re.S), so
#                   indented continuation lines stay part of the value
#   \s*(?=...)      stop right before the next line that starts a new key,
#                   or at the very end of the chunk (\Z)
pat = re.compile(
    r"^([^\s][^:]*): (.+?)\s*(?=^[^\s][^:]*:|\Z)", flags=re.S | re.M
)


def parse_views(text):
    """Parse blank-line separated "Key: value" records into dictionaries.

    Args:
        text: Full text of the VIEWS file. Records are separated by a blank
            line; a line that starts with whitespace continues the value of
            the previous field.

    Returns:
        A list with one dict per non-empty record, mapping each field name
        to its value. Multi-line values keep their embedded newline and
        indentation exactly as they appear in ``text``.
    """
    return [dict(pat.findall(chunk)) for chunk in text.split("\n\n") if chunk]


def main():
    """Download the Bioconductor VIEWS file and print the parsed records."""
    # Imported here so that parse_views() can be used without requests.
    import requests

    url = "https://bioconductor.org/packages/release/bioc/VIEWS"
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    print(parse_views(response.text))


if __name__ == "__main__":
    main()
Prints:
[ { "Package": "a4", "Version": "1.44.0", "Depends": "a4Base, a4Preproc, a4Classif, a4Core, a4Reporting", "Suggests": "MLP, nlcv, ALL, Cairo, Rgraphviz, GOstats", "License": "GPL-3", "MD5sum": "cc696d3373a9f258d293f2d966da11d5", "NeedsCompilation": "no", "Title": "Automated Affymetrix Array Analysis Umbrella Package", "Description": "Umbrella package is available for the entire Automated\n Affymetrix Array Analysis suite of package.", "biocViews": "Microarray", "Author": "Willem Talloen [aut], Tobias Verbeke [aut], Laure Cougnaud\n [cre]", "Maintainer": "Laure Cougnaud <laure.cougnaud@openanalytics.eu>", "git_url": "https://git.bioconductor.org/packages/a4", "git_branch": "RELEASE_3_15", "git_last_commit": "5b0fc5a", "git_last_commit_date": "2022-04-26", "Date/Publication": "2022-04-26", "source.ver": "src/contrib/a4_1.44.0.tar.gz", "win.binary.ver": "bin/windows/contrib/4.2/a4_1.44.0.zip", "mac.binary.ver": "bin/macosx/contrib/4.2/a4_1.44.0.tgz", "vignettes": "vignettes/a4/inst/doc/a4vignette.pdf", "vignetteTitles": "a4vignette", "hasREADME": "FALSE", "hasNEWS": "TRUE", "hasINSTALL": "FALSE", "hasLICENSE": "FALSE", "Rfiles": "vignettes/a4/inst/doc/a4vignette.R", "dependencyCount": "82" }, { "Package": "a4Base", "Version": "1.44.0", "Depends": "a4Preproc, a4Core", "Imports": "methods, graphics, grid, Biobase, annaffy, mpm, genefilter,\n limma, multtest, glmnet, gplots", "Suggests": "Cairo, ALL, hgu95av2.db, nlcv", "Enhances": "gridSVG, JavaGD", "License": "GPL-3", "MD5sum": "094c0a1c87b18ff8f16a3dbe4d06da64", "NeedsCompilation": "no", "Title": "Automated Affymetrix Array Analysis Base Package", "Description": "Base utility functions are available for the Automated\n Affymetrix Array Analysis set of packages.", "biocViews": "Microarray", "Author": "Willem Talloen [aut], Tine Casneuf [aut], An De Bondt [aut],\n Steven Osselaer [aut], Hinrich Goehlmann [aut], Willem\n Ligtenberg [aut], Tobias Verbeke [aut], Laure Cougnaud [cre]", "Maintainer": "Laure Cougnaud 
<laure.cougnaud@openanalytics.eu>", "git_url": "https://git.bioconductor.org/packages/a4Base", "git_branch": "RELEASE_3_15", "git_last_commit": "9ae69e0", "git_last_commit_date": "2022-04-26", "Date/Publication": "2022-04-26", "source.ver": "src/contrib/a4Base_1.44.0.tar.gz", "win.binary.ver": "bin/windows/contrib/4.2/a4Base_1.44.0.zip", "mac.binary.ver": "bin/macosx/contrib/4.2/a4Base_1.44.0.tgz", "hasREADME": "FALSE", "hasNEWS": "TRUE", "hasINSTALL": "FALSE", "hasLICENSE": "FALSE", "dependsOnMe": "a4", "suggestsMe": "epimutacions", "dependencyCount": "73" }, ...and so on.