Module `classical_atlas.topos_wrangler`

Expand source code

import json
import os
from bs4 import BeautifulSoup
import csv
from collections import defaultdict


def load_json(file_name):
    """
    Load a JSON file as a Python dataframe.

    Parameters
    ----------
    file_name : string
        file name of the target JSON file

    Returns
    -------
    dataframe
        A dataframe representing the JSON file
    """
    file = open(file_name, "r+", encoding="utf8")
    df = json.load(file)
    return df


def get_topos_data():
    """
    Loads the Topos Text Gazeteer data.

    Returns
    -------
    dataframe
        A dataframe with data from the Topos Text Gazeteer
    """
    return load_json("data/ToposTextGazetteer.jsonld")


def switch_to_pleiades_ids(topos_df, topos_places):
    """
    Swap all IDs to Pleiades IDs if possible

    Parameters
    ----------
    topos_df : dataframe
        a dataframe with data from the Topos Text Gazeteer

    topos_places : dictionary
        a dictionary with key=texts and value=list of Topos Text IDs

    Returns
    -------
    dictionary
        dictionary with key=texts and value=list of Pleiades IDs, where possible

    Notes
    -----
    This method reports how many IDs were unable to be switched. In the case that there is no
    corresponding Pleiades ID for a given Topos Text ID, the Topos Text ID is retained in the dictionary.
    """
    ids = _topos_pleiades_ids(topos_df)
    switch_counter = 0
    did_not_switch = 0
    for place in topos_places.keys():
        for i in range(len(topos_places[place])):
            topos_id = topos_places[place][i]
            if topos_id in ids.keys():
                topos_places[place][i] = ids[topos_id]
                switch_counter += 1
            else:
                did_not_switch += 1
    print("Made " + str(switch_counter) + " switches.")
    print("No Pleiades ID available for " + str(did_not_switch) + " topos places.")
    return topos_places


def swap_key_value_pairs(textual_refs):
    """
    Reorganize the dictionary with key=ID and value= list of texts

    Parameters
    ----------
    textual_refs : dictionary
        dictionary with key=text and value=ID

    Returns
    -------
    dictionary
        dictionary with key=ID and value=list of texts
    """
    reorganized = defaultdict(list)
    for text in textual_refs.keys():
        for id in textual_refs[text]:
            reorganized[id].append(text)
    for i in reorganized.keys():
        set(reorganized[i])
    return reorganized


def parse_topos_place_refs():
    """
    Create a dictionary from the Topos Text .csv file

    Returns
    -------
    dictionary
        A dictionary with key=text and value=list of Topos Text IDs
    """
    topos_places = defaultdict(list)
    with open('data/topos_data.csv', 'r', encoding="utf8") as topos_csv:
        csv_reader = csv.reader(topos_csv, delimiter=',')
        for row in csv_reader:
            if len(row) > 0:
                for i in range(len(row)):
                    if i == 0:
                        key = row[i]
                        topos_places[key] = []
                    else:
                        topos_places[key].append(row[i])
    return topos_places


def _parse_topos_place_refs_from_all_files(data='data/topos_data/'):
    directory = os.fsencode(data)
    topos_places = defaultdict(list)
    print("Parsing data...please be patient.")
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        # print(filename)
        if filename.endswith(".htm") or filename.endswith(".html"):
            # print("here!")
            name = "data/topos_data/" + filename
            text = open(name, "r", encoding="utf8").read()
            soup = BeautifulSoup(text, 'html.parser')
            title = soup.find('meta', property='dc:title', itemprop='name', lang='en').get_text("|")
            elements = title.split("|")
            for element in elements:
                if len(element) > 1:
                    short_title = element
                    # print(short_title)
                    break
            place_refs = soup.find_all('a', {"class": "place"})
            for ref in place_refs:
                link = ref['about'].split("/")[-1]
                topos_places[short_title].append(link)
        else:
            continue
    print("Parsed " + str(len(topos_places)) + " documents.")
    return topos_places


def _write_topos_place_refs_to_csv(topos_place_refs):
    with open('data/topos_data.csv', 'w+', encoding="utf8") as topos_csv:
        # create the csv writer
        writer = csv.writer(topos_csv)
        for item in topos_place_refs.keys():
            row = [item]
            for ref in topos_place_refs[item]:
                row.append(ref)
            writer.writerow(row)


def _topos_pleiades_ids(df, key_selector='topos'):
    topos_pleiades_ids = {}
    pleiades_topos_ids = {}
    for location in range(len(df['features'])):
        pleiades_link = None
        pleiades_id = None
        if 'links' in df['features'][location]:
            if df['features'][location]['links'][0]:
                pleiades_link = df['features'][location]['links'][0]['identifier']
        topos_link = df['features'][location]['@id']
        if pleiades_link and 'pleiades' in pleiades_link:
            pleiades_id = pleiades_link.split("/")[-1]
        topos_id = topos_link.split("/")[-1]
        topos_pleiades_ids[topos_id] = pleiades_id
        if pleiades_id:
            pleiades_topos_ids[pleiades_id] = topos_id
    if key_selector == 'topos':
        return topos_pleiades_ids
    else:
        return pleiades_topos_ids

Functions

def get_topos_data()

Loads the Topos Text Gazeteer data.

Returns

dataframe: A dataframe with data from the Topos Text Gazeteer

Expand source code

def get_topos_data():
    """
    Loads the Topos Text Gazeteer data.

    Returns
    -------
    dataframe
        A dataframe with data from the Topos Text Gazeteer
    """
    return load_json("data/ToposTextGazetteer.jsonld")

def load_json(file_name)

Load a JSON file as a Python dataframe.

Parameters

file_name : string: file name of the target JSON file

Returns

dataframe: A dataframe representing the JSON file

Expand source code

def load_json(file_name):
    """
    Load a JSON file as a Python dataframe.

    Parameters
    ----------
    file_name : string
        file name of the target JSON file

    Returns
    -------
    dataframe
        A dataframe representing the JSON file
    """
    file = open(file_name, "r+", encoding="utf8")
    df = json.load(file)
    return df

def parse_topos_place_refs()

Create a dictionary from the Topos Text .csv file

Returns

dictionary: A dictionary with key=text and value=list of Topos Text IDs

Expand source code

def parse_topos_place_refs():
    """
    Create a dictionary from the Topos Text .csv file

    Returns
    -------
    dictionary
        A dictionary with key=text and value=list of Topos Text IDs
    """
    topos_places = defaultdict(list)
    with open('data/topos_data.csv', 'r', encoding="utf8") as topos_csv:
        csv_reader = csv.reader(topos_csv, delimiter=',')
        for row in csv_reader:
            if len(row) > 0:
                for i in range(len(row)):
                    if i == 0:
                        key = row[i]
                        topos_places[key] = []
                    else:
                        topos_places[key].append(row[i])
    return topos_places

def swap_key_value_pairs(textual_refs)

Reorganize the dictionary with key=ID and value= list of texts

Parameters

textual_refs : dictionary: dictionary with key=text and value=ID

Returns

dictionary: dictionary with key=ID and value=list of texts

Expand source code

def swap_key_value_pairs(textual_refs):
    """
    Reorganize the dictionary with key=ID and value= list of texts

    Parameters
    ----------
    textual_refs : dictionary
        dictionary with key=text and value=ID

    Returns
    -------
    dictionary
        dictionary with key=ID and value=list of texts
    """
    reorganized = defaultdict(list)
    for text in textual_refs.keys():
        for id in textual_refs[text]:
            reorganized[id].append(text)
    for i in reorganized.keys():
        set(reorganized[i])
    return reorganized

def switch_to_pleiades_ids(topos_df, topos_places)

Swap all IDs to Pleiades IDs if possible

Parameters

topos_df : dataframe: a dataframe with data from the Topos Text Gazeteer
topos_places : dictionary: a dictionary with key=texts and value=list of Topos Text IDs

Returns

dictionary: dictionary with key=texts and value=list of Pleiades IDs, where possible

Notes

This method reports how many IDs were unable to be switched. In the case that there is no corresponding Pleiades ID for a given Topos Text ID, the Topos Text ID is retained in the dictionary.

Expand source code

def switch_to_pleiades_ids(topos_df, topos_places):
    """
    Swap all IDs to Pleiades IDs if possible

    Parameters
    ----------
    topos_df : dataframe
        a dataframe with data from the Topos Text Gazeteer

    topos_places : dictionary
        a dictionary with key=texts and value=list of Topos Text IDs

    Returns
    -------
    dictionary
        dictionary with key=texts and value=list of Pleiades IDs, where possible

    Notes
    -----
    This method reports how many IDs were unable to be switched. In the case that there is no
    corresponding Pleiades ID for a given Topos Text ID, the Topos Text ID is retained in the dictionary.
    """
    ids = _topos_pleiades_ids(topos_df)
    switch_counter = 0
    did_not_switch = 0
    for place in topos_places.keys():
        for i in range(len(topos_places[place])):
            topos_id = topos_places[place][i]
            if topos_id in ids.keys():
                topos_places[place][i] = ids[topos_id]
                switch_counter += 1
            else:
                did_not_switch += 1
    print("Made " + str(switch_counter) + " switches.")
    print("No Pleiades ID available for " + str(did_not_switch) + " topos places.")
    return topos_places