# Source code for rSpringRank.datasets._fetchers

from ._registry import registry, registry_urls

# import networkx as nx
try:
    import graph_tool.all as gt
except ModuleNotFoundError:
    print("We need graph-tool to load the datasets. Please install graph-tool.")

import linecache
import tempfile
from pathlib import Path

import zstandard as zstd

try:
    import pooch
except ImportError:
    # pooch is an optional dependency: without it no dataset can be
    # downloaded, and fetch_data() below raises ImportError instead.
    pooch = None
    data_fetcher = None
else:
    # Shared pooch downloader used by every dataset loader in this module.
    data_fetcher = pooch.create(
        # Use the default cache folder for the operating system
        # Pooch uses appdirs (https://github.com/ActiveState/appdirs) to
        # select an appropriate directory for the cache on each platform.
        path=pooch.os_cache("rSpringRank-data"),
        # The remote data is on Github
        # base_url is a required param, even though we override this
        # using individual urls in the registry.
        base_url="https://github.com/junipertcy/",
        registry=registry,
        urls=registry_urls,
    )


def fetch_data(dataset_name, data_fetcher=data_fetcher):
    """Download a dataset file (or return it from the local cache).

    Parameters
    ----------
    dataset_name : str
        Key of the file in the pooch registry (e.g. ``"us_air_traffic.gt.zst"``).
    data_fetcher : pooch.Pooch or None
        Downloader to use; defaults to the module-level one created above
        (``None`` when pooch is not installed).

    Returns
    -------
    str
        Full path to the downloaded (or cached) data file.

    Raises
    ------
    ImportError
        If the optional dependency ``pooch`` is not installed.
    """
    if data_fetcher is None:
        # Error message previously referred to "scipy.datasets" — a
        # copy-paste leftover; it now names this package's module.
        raise ImportError(
            "Missing optional dependency 'pooch' required "
            "for the rSpringRank.datasets module. Please use pip or "
            "conda to install 'pooch'."
        )
    # The "fetch" method returns the full path to the downloaded data file.
    return data_fetcher.fetch(dataset_name)


def _load_zst_graph(fname):
    """Decompress a zstd-compressed graph-tool file and load it as a Graph.

    The compressed file at ``fname`` is streamed into a temporary
    directory (dropping the ``.zst`` suffix) and loaded with
    ``gt.load_graph`` before the directory is removed.
    """
    fname = Path(fname)
    dctx = zstd.ZstdDecompressor()
    with tempfile.TemporaryDirectory() as temp_dir:
        gt_fname = Path(temp_dir) / fname.with_suffix("").name
        with open(fname, "rb") as ifh, open(gt_fname, "wb") as ofh:
            dctx.copy_stream(ifh, ofh)
        # Load while the temporary file still exists.
        return gt.load_graph(gt_fname.as_posix())


def us_air_traffic():
    """Load the US air traffic dataset as a graph-tool Graph.

    The file is downloaded automatically the first time this is run;
    afterwards, pooch finds it in the local cache and doesn't repeat
    the download.
    """
    return _load_zst_graph(fetch_data("us_air_traffic.gt.zst"))


def at_migrations():
    """Load the Austrian migrations dataset as a graph-tool Graph.

    Downloaded on first use, then served from the pooch cache.
    """
    return _load_zst_graph(fetch_data("at_migrations.gt.zst"))
def PhD_exchange():
    """Load the PhD exchange network as a graph-tool Graph.

    Edges are read from ``PhD_exchange.txt`` (space-delimited rows of
    target, source, weight, time); note the source/target swap when the
    edge is added. Edge properties ``eweight`` (double) and ``etime``
    (int time id) are attached, along with vertex properties ``vname``
    (school name looked up in ``school_names.txt``) and ``vindex``.
    """
    fname = fetch_data("PhD_exchange.txt")
    fname_school_names = fetch_data("school_names.txt")
    delimiter = " "

    g = gt.Graph()
    vname = g.new_vp("string")
    vindex = g.new_vp("int")
    eweight = g.new_ep("double")
    etime = g.new_ep("int")

    # Ids are assigned in order of first appearance; because every new
    # entry gets the current dict size, setdefault(..., len(d)) assigns
    # the same ids the original running counters would.
    name2id = {}
    time2id = {}
    with open(fname, "r") as fh:
        for row in fh:
            fields = row.replace("\n", "").split(delimiter)[:4]
            tgt = name2id.setdefault(fields[0], len(name2id))
            src = name2id.setdefault(fields[1], len(name2id))
            t = time2id.setdefault(fields[3], len(time2id))
            g.add_edge_list(
                # note the source / target order
                [(src, tgt, fields[2], t)],
                eprops=[eweight, etime],
            )
    g.edge_properties["eweight"] = eweight
    g.edge_properties["etime"] = etime

    id2name = {idx: label for label, idx in name2id.items()}

    def school_name(n):
        # linecache lines are 1-indexed; drop the trailing character
        # left after stripping the newline.
        return linecache.getline(fname_school_names, n).replace("\n", "")[:-1]

    # print(school_name(165))
    # >> University of Michigan

    for vertex in g.vertices():
        vname[vertex] = school_name(int(id2name[vertex]))
        # print(vname[vertex], vertex, id2name[vertex])
        vindex[vertex] = vertex.__int__()
    g.vertex_properties["vname"] = vname
    g.vertex_properties["vindex"] = vindex
    # print(name2id)
    return g