In [8]:
import pandas as pd
import altair as alt
# Load the artefact overview statistics and the total record statistics,
# in that order, from the CSV files next to the notebook.
artefacts_data, records_data = map(
    pd.read_csv,
    ("artefacts-overview-stats.csv", "records-institution-stats-total.csv"),
)

In [9]:
import pandas as pd
# Read the per-type artefact and record statistics, then join them on the
# (institution, type) pair so each row carries the columns of both tables.
artefacts_type_df, records_type_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "artefacts-type-overview-stats.csv",
        "records-institution-stats-type-total.csv",
    )
)
combined_art_record_type = artefacts_type_df.merge(
    records_type_df, on=['institution', 'type']
)

# Cataloguing

This is an attempt to give an overview of how much of each collection has been catalogued. Because this requires combining uncertain artefact statistics with uncertain catalogued-record statistics, the results carry a lot of uncertainty. One issue, obvious from some of the very high cataloguing percentages below (over 1000%!), arises where a low estimated number of artefacts is paired with a high known number of records. We could perhaps safely assume that the number of artefacts is always at least the number of records, but the collection would then show up as 100% catalogued, which is likely not true.


## Artefact and Record counts

In [10]:
# Bar chart comparing artefact_count and record_count, coloured by institution.
# The fold transform reshapes the two count columns into long form: the column
# name goes into 'type' and its value into 'count'. Within this chart 'type'
# therefore holds 'artefact_count' / 'record_count', not the 'type' column of
# the merged data.
title = alt.TitleParams('Comparison of artefact and record counts', anchor='middle')
alt.Chart(combined_art_record_type, title=title).mark_bar().encode(
    alt.Color('institution:N', sort='descending', legend=alt.Legend(orient='bottom', columns=4)),
    x='count:Q',
    # Tooltip reads the original (pre-fold) count columns, which the fold keeps.
    tooltip=['institution', 'artefact_count', 'record_count'],
    y='type:N',
).properties(width=800).resolve_scale(x='independent').transform_fold(
    as_=['type', 'count'],
    fold=['artefact_count', 'record_count'],
).configure(numberFormat='.2s')

In [12]:
# Join the artefact overview statistics to the record statistics by institution.
art_records = artefacts_data.merge(records_data, on="institution")

## Artefacts Catalogued

This is a very rough attempt to give a percentage of the extent to which a collection has been catalogued. The figures are based on personal interpretation piled on personal interpretation,
so they should not be used. The scale of the error is visible in the x-axis extending to 6000%, which should give an indication of
how untrustworthy this chart is.

In [13]:
import altair as alt

# Overall share catalogued per institution. The ratio
# record_count / artefact_count is computed inside the chart and shown on a
# percent-formatted x axis, with the same value as the tooltip.
title = alt.TitleParams('Catalogued Artefacts (overall percentage)', anchor='middle')
(
    alt.Chart(art_records, title=title)
    .mark_bar()
    .encode(
        alt.X('percent_catalogued:Q').axis(format='.0%'),
        alt.Y('institution:N'),
        alt.Tooltip("percent_catalogued:Q"),
    )
    .transform_calculate(
        percent_catalogued="datum.record_count / datum.artefact_count"
    )
)

In [65]:
import pandas as pd
# Same load-and-join as the earlier per-type cell (identical files and join
# keys); running it rebinds the three names to freshly read data.
artefacts_type_df = pd.read_csv(
    filepath_or_buffer="artefacts-type-overview-stats.csv"
)
records_type_df = pd.read_csv(
    filepath_or_buffer="records-institution-stats-type-total.csv"
)
combined_art_record_type = pd.merge(
    left=artefacts_type_df,
    right=records_type_df,
    on=['institution', 'type'],
)


## Catalogued Artefacts by institution and cataloguing type

In [66]:
# One facet row per institution; within each row, one bar per cataloguing type
# showing record_count / artefact_count on a percent-formatted x axis. Each
# row gets its own x scale.
# NOTE(review): colour is bound to a 'topic' field with the legend title
# "settings"; neither appears in the cells that build
# combined_art_record_type - confirm the field exists in the CSV data.
title = alt.TitleParams('Catalogued Artefacts by institution and type', anchor='middle')
alt.Chart(combined_art_record_type, width=400, title=title, height=alt.Step(8)).mark_bar().encode(
    alt.Y("type:N").title(None),
    alt.Tooltip('percent_catalogued:Q'),
    alt.X("percent_catalogued:Q").title(None).axis(format='.0%'),
    alt.Color("topic:N").title("settings").legend(orient="bottom", titleOrient="left"),
    alt.Row("institution:N").title("Institution").header(labelAngle=0, labelAlign='left'),
).resolve_scale(x='independent').transform_calculate(
    percent_catalogued="datum.record_count / datum.artefact_count"
)

## Published Collection Records

In [67]:
# Rebinds records_data: from here on it is read from
# records-institution-stats-type.csv instead of the "-total" file loaded
# earlier, so later cells see a different table under the same name.
records_data = pd.read_csv(
    filepath_or_buffer="records-institution-stats-type.csv"
)

In [14]:

# Record counts per institution, coloured by type and split into one facet
# column per 'precision' value, each with its own x scale.
# The width is set once, via .properties(width=400); the constructor used to
# pass width=100 as well, which .properties() overrode.
title = alt.TitleParams('Overall Published Collection Records', anchor='middle')
alt.Chart(records_data, title=title).mark_bar().encode(
    column='precision:O',
    x='record_count:Q',
    y='institution:N',
    tooltip='record_count:Q',
    color=alt.Color('type:N', sort='descending').legend(orient="top", titleOrient="left")
).properties(width=400).resolve_scale(x='independent').configure(numberFormat='.2s')

In [69]:
# Build the table behind the artefacts-vs-records progress chart.
# The left join keeps every artefacts_data row and attaches record_count where
# an (institution, type) match exists in records_data.
# NOTE(review): this joins on 'type', whereas the earlier art_records merge of
# artefacts_data used 'institution' only - confirm artefacts_data has a
# 'type' column.
progress_df = artefacts_data.merge(
    records_data[['institution', 'type', 'record_count']],
    how='left',
    on=['institution', 'type'],
)

## Cataloguing Progress

In [72]:
# Progress per institution, computed in the chart as
# record_count / artefact_count; numbers are rendered with the '.2%' format.
# NOTE(review): if progress_df holds several rows per institution, the bars
# are presumably stacked and their length is the sum of the ratios - confirm
# that is intended.
title = alt.TitleParams('Cataloguing Progress', anchor='middle')
(
    alt.Chart(progress_df, title=title)
    .mark_bar()
    .encode(
        alt.X('progress:Q'),
        alt.Y('institution:N'),
        alt.Tooltip('progress:Q'),
    )
    .properties(width=400)
    .resolve_scale(x='independent')
    .transform_calculate(progress='datum.record_count / datum.artefact_count')
    .configure(numberFormat='.2%')
)