# Source code for pv_evaluation.benchmark.report

from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
from er_evaluation.estimators import (estimates_table,
                                      pairwise_precision_design_estimate,
                                      pairwise_recall_design_estimate)
from er_evaluation.metrics import (metrics_table, pairwise_precision,
                                   pairwise_recall)
from er_evaluation.plots import (compare_plots,
                                 plot_cluster_sizes_distribution,
                                 plot_entropy_curve)
from er_evaluation.summary import (cluster_sizes, homonimy_rate, matching_rate,
                                   name_variation_rate)
from er_evaluation.utils import expand_grid

from pv_evaluation.benchmark import (load_als_inventors_benchmark,
                                     load_binette_2022_inventors_benchmark,
                                     load_ens_inventors_benchmark,
                                     load_israeli_inventors_benchmark,
                                     load_lai_2011_inventors_benchmark,
                                     load_patentsview_inventors_benchmark)

# Estimators used when `estimators=None` is passed to `inventor_estimates_plot`
# and `inventor_estimates_trend_plot`.
DEFAULT_ESTIMATORS = {
    # Keys are the display names of the estimators; values are the design
    # estimators imported from `er_evaluation.estimators`.
    # NOTE(review): each estimator presumably yields both a point estimate and
    # a standard deviation estimate (the plots read "value" and "std" columns)
    # -- confirm against er_evaluation.
    "pairwise precision": pairwise_precision_design_estimate,
    "pairwise recall": pairwise_recall_design_estimate,
}
# Reference samples used when `samples_weights=None` is passed to the estimate
# plots. The benchmark is loaded when this module is imported.
DEFAULT_INVENTORS_SAMPLES_WEIGHTS = {
    # Each entry provides the sampled clusters ("sample") and the sampling
    # weights ("weights"), here the inverse of each sampled cluster's size.
    "binette-sample": {
        "sample": load_binette_2022_inventors_benchmark(),
        "weights": 1 / cluster_sizes(load_binette_2022_inventors_benchmark()),
    },
}
# Benchmarks used when `references=None` is passed to `inventor_benchmark_plot`.
# All of them are loaded when this module is imported.
DEFAULT_INVENTORS_BENCHMARKS = {
    "patentsview-inventors": load_patentsview_inventors_benchmark(),
    "israeli-inventors": load_israeli_inventors_benchmark(),
    "lai-benchmark": load_lai_2011_inventors_benchmark(),
    "als-benchmark": load_als_inventors_benchmark(),
    "ens-benchmark": load_ens_inventors_benchmark(),
    "binette-benchmark": load_binette_2022_inventors_benchmark(),
}
# Metrics used when `metrics=None` is passed to `inventor_benchmark_plot`.
# Keys are display names; values are metric functions from `er_evaluation.metrics`.
DEFAULT_METRICS = {
    "pairwise precision": pairwise_precision,
    "pairwise recall": pairwise_recall,
}


def inventor_summary_trend_plot(persistent_inventor, names):
    """Plot summary statistics of successive disambiguations over time.

    The input DataFrame is not modified.

    Args:
        persistent_inventor (DataFrame): String-valued DataFrame in the format of PatentsView's bulk data download file "g_persistent_inventor.tsv". This should contain the columns "patent_id", "sequence", as well as columns with names of the form "disamb_inventor_id_YYYYMMDD" for inventor IDs corresponding to the given disambiguation date.
        names (Series): pandas Series indexed by mention IDs and with values corresponding to mentioned inventor name.

    Returns:
        Plotly line plot of the matching rate, homonymy rate, and name variation rate, with one point per disambiguation date.
    """
    prefix = "disamb_inventor_id_"

    # Build the mention ID index on the side rather than writing it into the
    # caller's DataFrame (the previous implementation re-indexed it in place).
    mention_ids = pd.Index(
        "US" + persistent_inventor["patent_id"] + "-" + persistent_inventor["sequence"],
        name="mention_id",
    )

    # Slice off the exact prefix: `str.lstrip` removes a *set of characters*,
    # not a prefix, and only worked because the date suffix is all digits.
    disambiguations = {
        column[len(prefix):]: pd.Series(
            persistent_inventor[column].values, index=mention_ids, name=column
        ).dropna()
        for column in persistent_inventor.columns
        if column.startswith(prefix)
    }

    metrics = {
        "Matching rate": matching_rate,
        "Homonimy rate": lambda membership: homonimy_rate(membership, names),
        "Name variation rate": lambda membership: name_variation_rate(membership, names),
    }

    # One row per (disambiguation date, metric) pair.
    data = expand_grid(disambiguation=disambiguations, metric=metrics)
    data["value"] = data.apply(
        lambda row: metrics[row["metric"]](disambiguations[row["disambiguation"]]), axis=1
    )
    data["date"] = pd.to_datetime([datetime.strptime(d, "%Y%m%d") for d in data["disambiguation"]])

    fig = px.line(
        data,
        y="value",
        x="date",
        color="metric",
        symbol="metric",
        color_discrete_sequence=px.colors.qualitative.Vivid,
    )
    # All three statistics are rates, so pin the axis to the unit interval.
    fig.update_layout(yaxis_range=(0, 1))

    return fig


def inventor_estimates_trend_plot(persistent_inventor, samples_weights=None, estimators=None, **kwargs):
    """Plot performance estimates over time.

    The input DataFrame is not modified.

    Note:
        The timeframe for the disambiguation should match the timeframe considered by the reference sample.

    Args:
        persistent_inventor (DataFrame): String-valued DataFrame in the format of PatentsView's bulk data download file "g_persistent_inventor.tsv". This should contain the columns "patent_id", "sequence", as well as columns with names of the form "disamb_inventor_id_YYYYMMDD" for inventor IDs corresponding to the given disambiguation date.
        samples_weights (dict, optional): Dictionary mapping a sample name to a dictionary with "sample" and "weights" entries. Defaults to `DEFAULT_INVENTORS_SAMPLES_WEIGHTS`.
        estimators (dict, optional): Dictionary mapping an estimator name to an estimator function. Defaults to `DEFAULT_ESTIMATORS`.
        **kwargs: Additional keyword arguments forwarded to `plotly.express.line`.

    Returns:
        Plotly line plot of the estimates, with error bars taken from the "std" column.
    """
    if estimators is None:
        estimators = DEFAULT_ESTIMATORS
    if samples_weights is None:
        samples_weights = DEFAULT_INVENTORS_SAMPLES_WEIGHTS

    prefix = "disamb_inventor_id_"

    # Build the mention ID index on the side rather than writing it into the
    # caller's DataFrame (the previous implementation re-indexed it in place).
    mention_ids = pd.Index(
        "US" + persistent_inventor["patent_id"] + "-" + persistent_inventor["sequence"],
        name="mention_id",
    )

    # Slice off the exact prefix: `str.lstrip` removes a *set of characters*,
    # not a prefix, and only worked because the date suffix is all digits.
    disambiguations = {
        column[len(prefix):]: pd.Series(
            persistent_inventor[column].values, index=mention_ids, name=column
        ).dropna()
        for column in persistent_inventor.columns
        if column.startswith(prefix)
    }

    computed_metrics = estimates_table(disambiguations, samples_weights=samples_weights, estimators=estimators)
    # The "prediction" column holds the dictionary keys, i.e. the YYYYMMDD date strings.
    computed_metrics["date"] = pd.to_datetime(
        [datetime.strptime(d, "%Y%m%d") for d in computed_metrics["prediction"]]
    )

    # Forward **kwargs like the sibling plotting functions do; they used to be
    # accepted and silently dropped.
    return px.line(
        computed_metrics,
        y="value",
        x="date",
        error_y="std",
        color="estimator",
        symbol="estimator",
        color_discrete_sequence=px.colors.qualitative.Vivid,
        **kwargs,
    )


def inventor_estimates_plot(disambiguations, samples_weights=None, estimators=None, facet_col_wrap=2, **kwargs):
    """Bar plot of performance estimates computed from reference cluster samples.

    Note:
        The timeframe for the disambiguation should match the timeframe considered by the reference sample.

    Args:
        disambiguations (dict): dictionary of disambiguation results (disambiguation results are pandas Series with "mention_id" index and cluster assignment values).
            The disambiguated population should match the population from which the samples have been drawn. For instance, if using the Israeli benchmark dataset,
            which covers patents granted between 1963 and 1999, then `disambiguations` should be subsetted to the same time period.
        samples_weights (dict, optional): Dictionary mapping a sample name to a dictionary with "sample" and "weights" entries. Defaults to `DEFAULT_INVENTORS_SAMPLES_WEIGHTS`.
        estimators (dict, optional): Dictionary mapping an estimator name to an estimator function. Defaults to `DEFAULT_ESTIMATORS`.
        facet_col_wrap (int, optional): Maximum number of facet columns. Defaults to 2.
        **kwargs: Additional keyword arguments forwarded to `plotly.express.bar`.

    Returns:
        Plotly bar chart with one facet per sample and one bar group per estimator.
    """
    chosen_estimators = DEFAULT_ESTIMATORS if estimators is None else estimators
    chosen_samples = DEFAULT_INVENTORS_SAMPLES_WEIGHTS if samples_weights is None else samples_weights

    estimates = estimates_table(disambiguations, samples_weights=chosen_samples, estimators=chosen_estimators)

    # Fixed layout of the chart; caller-supplied options are passed alongside.
    bar_options = {
        "y": "value",
        "x": "estimator",
        "error_y": "std",
        "color": "prediction",
        "facet_col": "sample_weights",
        "barmode": "group",
        "facet_col_wrap": facet_col_wrap,
    }
    return px.bar(estimates, **bar_options, **kwargs)


def inventor_benchmark_plot(predictions, references=None, metrics=None, facet_col_wrap=2, **kwargs):
    """Bar plot of performance evaluation metrics on benchmark datasets.

    Args:
        predictions (dict): dictionary of disambiguation results (disambiguation results are pandas Series with "mention_id" index and cluster assignment values).
        references (dict, optional): dictionary of benchmark datasets to evaluate against. Defaults to `DEFAULT_INVENTORS_BENCHMARKS`.
        metrics (dict, optional): dictionary of metrics (from the metrics submodule) to compute. Defaults to `DEFAULT_METRICS`.
        facet_col_wrap (int, optional): Maximum number of facet columns. Defaults to 2.
        **kwargs: Additional keyword arguments forwarded to `plotly.express.bar`.

    Returns:
        Plotly bar chart with one facet per reference and one bar group per metric.
    """
    chosen_references = DEFAULT_INVENTORS_BENCHMARKS if references is None else references
    chosen_metrics = DEFAULT_METRICS if metrics is None else metrics

    scores = metrics_table(predictions, chosen_references, chosen_metrics)

    # Fixed layout of the chart; caller-supplied options are passed alongside.
    bar_options = {
        "y": "value",
        "x": "metric",
        "color": "prediction",
        "facet_col": "reference",
        "barmode": "group",
        "facet_col_wrap": facet_col_wrap,
    }
    return px.bar(scores, **bar_options, **kwargs)


[docs]def style_cluster_inspection(table, by="prediction"):
    """Style table to highlight groups with alternating colors.

    Args:
        table (dataframe): DataFrame to style.
        by (str, optional): column to color by. Defaults to "prediction".
    """

    def format_color_groups(df):
        # From https://datascientyst.com/pandas-dataframe-background-color-based-condition-value-alternate-row-color-based-group/
        colors = ["white", "#c5dcf5"]
        x = df.copy()
        factors = list(x[by].unique())
        i = 0
        for factor in factors:
            style = f"background-color: {colors[i]}"
            x.loc[x[by] == factor, :] = style
            i = not i
        return x

    return table.style.apply(format_color_groups, axis=None)


def add_links(table, type="patentsview"):
    """Add PatentsView data tool links to a table with mention IDs as index.

    The patent code is the part of the mention ID before the first "-", with a
    leading "US" prefix removed. The "link" column is added to `table` in
    place; an empty table is returned unchanged.

    Args:
        table (DataFrame): pandas DataFrame with mention IDs as an index.
        type (str, optional): Currently unused; kept for backward compatibility. Defaults to "patentsview".

    Returns:
        DataFrame: the same `table` object, with an added "link" column of HTML anchors.
    """

    def patent_code(mention_id):
        code = mention_id.split("-", 1)[0]
        # Remove the literal "US" prefix only. `lstrip("US")` would strip any
        # run of leading "U"/"S" characters, eating into the patent code.
        return code[2:] if code.startswith("US") else code

    if len(table) > 0:
        table["link"] = [
            f"<a class='previewbox-anchor' href='https://datatool.patentsview.org/#detail/patent/{patent_code(x)}'>🔗</a>"
            for x in table.index
        ]
    return table


def inspect_clusters_to_split(disambiguation, benchmark, join_with=None, links=False):
    """Get table of cluster assignment errors on the given benchmark.

    A predicted cluster is reported when its mentions, restricted to those
    also present in the benchmark, span more than one reference cluster.

    Args:
        disambiguation (Series): disambiguation result Series (disambiguation results are pandas Series with "mention_id" index and cluster assignment values).
        benchmark (Series): reference disambiguation Series.
        join_with (DataFrame, optional): DataFrame indexed by "mention_id" whose columns are appended to the result. Overlapping column names get the suffix "_joined". Defaults to None.
        links (bool, optional): Whether to add a "link" column through `add_links`. Defaults to False.

    Returns:
        DataFrame: DataFrame containing erroneous cluster assignments according to the given benchmark, sorted by predicted cluster and then by reference cluster.
    """
    # Only mentions present in both Series can be compared.
    data = pd.concat({"prediction": disambiguation, "reference": benchmark}, axis=1, join="inner")

    # Number of distinct reference clusters found within each predicted cluster.
    reference_counts = data.groupby("prediction")["reference"].nunique()
    needs_split = data["prediction"].map(reference_counts) > 1

    # A single multi-key sort keeps references ordered within each prediction.
    # Two chained single-key sorts did not guarantee that, because the second
    # sort is not stable by default.
    table = data[needs_split].sort_values(["prediction", "reference"])

    if join_with is not None:
        table = table.join(join_with, rsuffix="_joined")

    if links:
        table = add_links(table)

    return table


def inspect_clusters_to_merge(disambiguation, benchmark, join_with=None, links=False):
    """Get table to inspect missing cluster links given a benchmark dataset.

    This runs `inspect_clusters_to_split` with the roles of the two Series
    swapped, then swaps the column labels back.

    Args:
        disambiguation (Series): disambiguation result Series (disambiguation results are pandas Series with "mention_id" index and cluster assignment values).
        benchmark (Series): reference disambiguation Series.
        join_with (DataFrame, optional): DataFrame indexed by "mention_id" whose columns are appended to the result. Defaults to None.
        links (bool, optional): Whether to add a "link" column through `add_links`. Defaults to False.

    Returns:
        DataFrame: DataFrame containing missing cluster links according to the given benchmark.
    """
    # Reference clusters spread over several predicted clusters are exactly the
    # "clusters to split" of the role-swapped problem.
    swapped = inspect_clusters_to_split(benchmark, disambiguation)
    table = swapped.rename(columns={"prediction": "reference", "reference": "prediction"})

    if join_with is not None:
        table = table.join(join_with, rsuffix="_joined")

    return add_links(table) if links else table


def top_inventors(disambiguation, names, n=10):
    """
    Table of most prolific inventors

    Args:
        disambiguation (Series): Membership vector, indexed by mention IDs, representing a given disambiguation.
        names (Series): Pandas Series indexed by mention IDs and with values corresponding to inventor name.
        n (int, optional): Number of clusters to keep. Defaults to 10.

    Returns:
        DataFrame: One row per cluster among the n largest, indexed by cluster ID, holding the first name found among that cluster's mentions.
    """
    # Cluster IDs of the n largest clusters.
    sizes = cluster_sizes(disambiguation)
    top_cluster_ids = sizes.sort_values(ascending=False).head(n).index.values

    # Mentions belonging to one of those clusters, with their names attached.
    in_top_cluster = np.isin(disambiguation.values, top_cluster_ids)
    mentions_with_names = pd.merge(
        disambiguation[in_top_cluster],
        names,
        how="left",
        left_index=True,
        right_index=True,
    )

    # The membership column carries the Series name, so group on that.
    return mentions_with_names.groupby(disambiguation.name).first()


def plot_entropy_curves(disambiguations):
    """
    Plot entropy curves for a set of disambiguations

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing given disambiguations. Keys are used as trace names.

    Returns:
        Plotly figure.
    """
    # One curve per disambiguation, overlaid in a single figure.
    curves = []
    for label, membership in disambiguations.items():
        curves.append(plot_entropy_curve(membership, name=label))

    fig = compare_plots(*curves)
    fig.update_layout(
        autosize=False,
        width=800,
        title="Hill numbers Curve",
        xaxis_title="q",
        yaxis_title="Hill number of order q",
    )
    fig.update_yaxes(autorange=True)
    return fig


def plot_cluster_sizes(disambiguations):
    """
    Plot cluster sizes for a set of disambiguations

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing given disambiguations. Keys are used as trace names.

    Returns:
        Plotly figure.
    """
    # One normalized distribution per disambiguation, overlaid in a single figure.
    distributions = []
    for label, membership in disambiguations.items():
        distributions.append(plot_cluster_sizes_distribution(membership, name=label, normalize=True))

    fig = compare_plots(*distributions)
    fig.update_layout(
        autosize=False,
        width=800,
        title="Cluster Sizes Distribution",
        xaxis_title="Cluster size",
        yaxis_title="Proportion",
    )
    # Show cluster sizes up to 20 on the x axis.
    fig.update_xaxes(range=(0, 20))
    fig.update_yaxes(autorange=True)
    return fig


def plot_name_variation_rates(disambiguations, names):
    """
    Plot name variation rates for a set of given disambiguations

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing given disambiguations.
        names (Series): Pandas Series indexed by mention IDs and with values corresponding to inventor name.

    Returns:
        Plotly figure.
    """
    label = "Name variation rate"
    rates = [name_variation_rate(membership, names=names) for membership in disambiguations.values()]

    # The constant "none" column puts all bars in one group on the x axis.
    data = pd.DataFrame({label: rates, "Disambiguation": disambiguations.keys(), "none": ""})

    fig = px.bar(data, x="none", y=label, color="Disambiguation", barmode="group")
    fig.update_layout(title=label, xaxis_title="")
    fig.update_yaxes(range=(0, 1))
    return fig


def plot_homonimy_rates(disambiguations, names):
    """
    Plot homonymy rates for a set of given disambiguations

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing given disambiguations.
        names (Series): Pandas Series indexed by mention IDs and with values corresponding to inventor name.

    Returns:
        Plotly figure.
    """
    label = "Homonimy rate"
    rates = [homonimy_rate(membership, names=names) for membership in disambiguations.values()]

    # The constant "none" column puts all bars in one group on the x axis.
    data = pd.DataFrame({label: rates, "Disambiguation": disambiguations.keys(), "none": ""})

    fig = px.bar(data, x="none", y=label, color="Disambiguation", barmode="group")
    fig.update_layout(title=label, xaxis_title="")
    fig.update_yaxes(range=(0, 1))
    return fig