# Source code for pv_evaluation.benchmark.report

from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
from er_evaluation.estimators import (estimates_table,
                                      pairwise_precision_design_estimate,
                                      pairwise_recall_design_estimate)
from er_evaluation.metrics import (metrics_table, pairwise_precision,
                                   pairwise_recall)
from er_evaluation.plots import (compare_plots,
                                 plot_cluster_sizes_distribution,
                                 plot_entropy_curve)
from er_evaluation.summary import (cluster_sizes, homonimy_rate, matching_rate,
                                   name_variation_rate)
from er_evaluation.utils import expand_grid

from pv_evaluation.benchmark import (load_als_inventors_benchmark,
                                     load_binette_2022_inventors_benchmark,
                                     load_ens_inventors_benchmark,
                                     load_israeli_inventors_benchmark,
                                     load_lai_2011_inventors_benchmark,
                                     load_patentsview_inventors_benchmark)

# Estimators used by the estimate plots when the caller passes `estimators=None`.
# Keys are the labels under which each estimator is reported.
DEFAULT_ESTIMATORS = {
    # Point estimates and standard deviation estimates.
    "pairwise precision": pairwise_precision_design_estimate,
    "pairwise recall": pairwise_recall_design_estimate,
}
# Samples (with their weights) used by the estimate plots when the caller passes
# `samples_weights=None`.
# NOTE: the benchmark loader is called twice here, and at import time.
DEFAULT_INVENTORS_SAMPLES_WEIGHTS = {
    # Dataset and parameters to pass to the estimator.
    "binette-sample": {
        "sample": load_binette_2022_inventors_benchmark(),
        # Each cluster is weighted by the inverse of its size.
        "weights": 1 / cluster_sizes(load_binette_2022_inventors_benchmark()),
    },
}
# Default benchmarks to run.
# Every loader below is invoked when this module is imported.
DEFAULT_INVENTORS_BENCHMARKS = {
    "patentsview-inventors": load_patentsview_inventors_benchmark(),
    "israeli-inventors": load_israeli_inventors_benchmark(),
    "lai-benchmark": load_lai_2011_inventors_benchmark(),
    "als-benchmark": load_als_inventors_benchmark(),
    "ens-benchmark": load_ens_inventors_benchmark(),
    "binette-benchmark": load_binette_2022_inventors_benchmark(),
}
# Metrics computed by `inventor_benchmark_plot` when the caller passes `metrics=None`.
DEFAULT_METRICS = {
    "pairwise precision": pairwise_precision,
    "pairwise recall": pairwise_recall,
}


def inventor_summary_trend_plot(persistent_inventor, names):
    """Plot key summary statistics of the disambiguation over time.

    The input DataFrame is not modified; a re-indexed copy is used internally.

    Args:
        persistent_inventor (DataFrame): String-valued DataFrame in the format of
            PatentsView's bulk data download file "g_persistent_inventor.tsv".
            This should contain the columns "patent_id", "sequence", as well as
            columns with names of the form "disamb_inventor_id_YYYYMMDD" for
            inventor IDs corresponding to the given disambiguation date.
        names (Series): pandas Series indexed by mention IDs and with values
            corresponding to mentioned inventor name.

    Returns:
        Plotly line plot of the matching rate, homonymy rate, and name variation
        rate, one point per disambiguation date.
    """
    prefix = "disamb_inventor_id_"

    # Build the "US<patent_id>-<sequence>" mention IDs on a copy so that the
    # caller's DataFrame keeps its original columns and index.
    mention_ids = "US" + persistent_inventor.patent_id + "-" + persistent_inventor.sequence
    indexed = persistent_inventor.assign(mention_id=mention_ids).set_index("mention_id")

    # Slice the prefix off explicitly: str.lstrip() removes a *set of characters*,
    # not a prefix, and only worked by accident for purely numeric suffixes.
    disambiguations = {
        column[len(prefix):]: indexed[column].dropna()
        for column in indexed.columns
        if column.startswith(prefix)
    }

    metrics = {
        "Matching rate": matching_rate,
        "Homonimy rate": lambda membership: homonimy_rate(membership, names),
        "Name variation rate": lambda membership: name_variation_rate(membership, names),
    }

    # One row per (disambiguation date, metric) combination.
    data = expand_grid(disambiguation=disambiguations, metric=metrics)
    data["value"] = data.apply(lambda row: metrics[row.metric](disambiguations[row.disambiguation]), axis=1)
    data["date"] = pd.to_datetime([datetime.strptime(d, "%Y%m%d") for d in data["disambiguation"]])

    fig = px.line(
        data,
        y="value",
        x="date",
        color="metric",
        symbol="metric",
        color_discrete_sequence=px.colors.qualitative.Vivid,
    )
    # All three statistics are rates, so pin the axis to [0, 1].
    fig.update_layout(yaxis_range=(0, 1))
    return fig
def inventor_estimates_trend_plot(persistent_inventor, samples_weights=None, estimators=None, **kwargs):
    """Plot performance estimates over time.

    Note:
        The timeframe for the disambiguation should match the timeframe
        considered by the reference sample.

    The input DataFrame is not modified; a re-indexed copy is used internally.

    Args:
        persistent_inventor (DataFrame): String-valued DataFrame in the format of
            PatentsView's bulk data download file "g_persistent_inventor.tsv".
            This should contain the columns "patent_id", "sequence", as well as
            columns with names of the form "disamb_inventor_id_YYYYMMDD" for
            inventor IDs corresponding to the given disambiguation date.
        samples_weights (dict, optional): Dictionary mapping a sample name to a
            dictionary with keys "sample" and "weights", passed on to
            `estimates_table`. Defaults to `DEFAULT_INVENTORS_SAMPLES_WEIGHTS`.
        estimators (dict, optional): Dictionary mapping an estimator name to an
            estimator function, passed on to `estimates_table`. Defaults to
            `DEFAULT_ESTIMATORS`.
        **kwargs: Additional keyword arguments forwarded to `plotly.express.line`.

    Returns:
        Plotly line plot of the estimates (with error bars) per disambiguation date.
    """
    if estimators is None:
        estimators = DEFAULT_ESTIMATORS
    if samples_weights is None:
        samples_weights = DEFAULT_INVENTORS_SAMPLES_WEIGHTS

    prefix = "disamb_inventor_id_"

    # Build the "US<patent_id>-<sequence>" mention IDs on a copy so that the
    # caller's DataFrame keeps its original columns and index.
    mention_ids = "US" + persistent_inventor.patent_id + "-" + persistent_inventor.sequence
    indexed = persistent_inventor.assign(mention_id=mention_ids).set_index("mention_id")

    # Slice the prefix off explicitly: str.lstrip() removes a *set of characters*,
    # not a prefix, and only worked by accident for purely numeric suffixes.
    disambiguations = {
        column[len(prefix):]: indexed[column].dropna()
        for column in indexed.columns
        if column.startswith(prefix)
    }

    computed_metrics = estimates_table(disambiguations, samples_weights=samples_weights, estimators=estimators)
    # The "prediction" column holds the dictionary keys, i.e. the YYYYMMDD dates.
    computed_metrics["date"] = pd.to_datetime(
        [datetime.strptime(d, "%Y%m%d") for d in computed_metrics["prediction"]]
    )

    # Forward **kwargs so this function is consistent with the other plot helpers
    # in this module (they were previously accepted but silently dropped).
    return px.line(
        computed_metrics,
        y="value",
        x="date",
        error_y="std",
        color="estimator",
        symbol="estimator",
        color_discrete_sequence=px.colors.qualitative.Vivid,
        **kwargs,
    )
def inventor_estimates_plot(disambiguations, samples_weights=None, estimators=None, facet_col_wrap=2, **kwargs):
    """Bar plot of performance estimates for the given disambiguations.

    Note:
        The timeframe for the disambiguation should match the timeframe
        considered by the reference sample.

    Args:
        disambiguations (dict): Dictionary of disambiguation results
            (pandas Series with "mention_id" index and cluster assignment
            values). The disambiguated population should match the population
            from which the samples have been drawn; e.g. when using a benchmark
            covering patents granted between 1963 and 1999, `disambiguations`
            should be subsetted to the same time period.
        samples_weights (dict, optional): Dictionary mapping a sample name to a
            dictionary with keys "sample" and "weights", passed on to
            `estimates_table`. Defaults to `DEFAULT_INVENTORS_SAMPLES_WEIGHTS`.
        estimators (dict, optional): Dictionary mapping an estimator name to an
            estimator function. Defaults to `DEFAULT_ESTIMATORS`.
        facet_col_wrap (int, optional): Number of facet columns before wrapping.
            Defaults to 2.
        **kwargs: Additional keyword arguments forwarded to `plotly.express.bar`.

    Returns:
        Plotly bar chart, one facet per sample.
    """
    chosen_estimators = DEFAULT_ESTIMATORS if estimators is None else estimators
    chosen_samples = DEFAULT_INVENTORS_SAMPLES_WEIGHTS if samples_weights is None else samples_weights

    estimates = estimates_table(
        disambiguations,
        samples_weights=chosen_samples,
        estimators=chosen_estimators,
    )

    bar_options = dict(
        y="value",
        x="estimator",
        error_y="std",
        color="prediction",
        facet_col="sample_weights",
        barmode="group",
        facet_col_wrap=facet_col_wrap,
    )
    return px.bar(estimates, **bar_options, **kwargs)
def inventor_benchmark_plot(predictions, references=None, metrics=None, facet_col_wrap=2, **kwargs):
    """Bar plot of performance evaluation metrics on benchmark datasets.

    Args:
        predictions (dict): Dictionary of disambiguation results (pandas Series
            with "mention_id" index and cluster assignment values).
        references (dict, optional): Dictionary of benchmark datasets to
            evaluate against. Defaults to `DEFAULT_INVENTORS_BENCHMARKS`.
        metrics (dict, optional): Dictionary of metrics to compute.
            Defaults to `DEFAULT_METRICS`.
        facet_col_wrap (int, optional): Number of facet columns before wrapping.
            Defaults to 2.
        **kwargs: Additional keyword arguments forwarded to `plotly.express.bar`.

    Returns:
        Plotly bar chart, one facet per reference dataset.
    """
    chosen_references = DEFAULT_INVENTORS_BENCHMARKS if references is None else references
    chosen_metrics = DEFAULT_METRICS if metrics is None else metrics

    scores = metrics_table(predictions, chosen_references, chosen_metrics)

    bar_options = dict(
        y="value",
        x="metric",
        color="prediction",
        facet_col="reference",
        barmode="group",
        facet_col_wrap=facet_col_wrap,
    )
    return px.bar(scores, **bar_options, **kwargs)
[docs]def style_cluster_inspection(table, by="prediction"): """Style table to highlight groups with alternating colors. Args: table (dataframe): DataFrame to style. by (str, optional): column to color by. Defaults to "prediction". """ def format_color_groups(df): # From https://datascientyst.com/pandas-dataframe-background-color-based-condition-value-alternate-row-color-based-group/ colors = ["white", "#c5dcf5"] x = df.copy() factors = list(x[by].unique()) i = 0 for factor in factors: style = f"background-color: {colors[i]}" x.loc[x[by] == factor, :] = style i = not i return x return table.style.apply(format_color_groups, axis=None)
def add_links(table, type="patentsview"):
    """Add PatentsView patent links to a table indexed by mention IDs.

    Mention IDs are expected to look like "US<patent_id>-<sequence>". The
    patent code is the part before the first "-", with a leading "US" prefix
    removed. The "link" column is added to `table` in place (also when the
    table is empty, so the resulting columns are always the same).

    Args:
        table (DataFrame): pandas DataFrame with mention IDs as an index.
        type (str, optional): Currently unused; kept for backward
            compatibility. Defaults to "patentsview".

    Returns:
        DataFrame: the same `table` object, with an added "link" column
        containing HTML anchors to the PatentsView data tool.
    """
    links = []
    for mention_id in table.index:
        patent_code = str(mention_id).partition("-")[0]
        # Remove the literal "US" prefix only; str.lstrip("US") would strip any
        # run of leading "U"/"S" characters.
        if patent_code.startswith("US"):
            patent_code = patent_code[2:]
        links.append(
            f"<a class='previewbox-anchor' href='https://datatool.patentsview.org/#detail/patent/{patent_code}'>🔗</a>"
        )
    table["link"] = links
    return table
def inspect_clusters_to_split(disambiguation, benchmark, join_with=None, links=False):
    """Get table of cluster assignment errors on the given benchmark.

    A predicted cluster is reported when its mentions map to more than one
    reference cluster. Only mentions present in both `disambiguation` and
    `benchmark` are considered.

    Args:
        disambiguation (Series): disambiguation result Series (disambiguation
            results are pandas Series with "mention_id" index and cluster
            assignment values).
        benchmark (Series): reference disambiguation Series.
        join_with (DataFrame, optional): DataFrame indexed by "mention_id" whose
            columns are joined to the result. Clashing column names from
            `join_with` get the suffix "_joined". Defaults to None.
        links (bool, optional): Whether to add a "link" column using
            `add_links`. Defaults to False.

    Returns:
        DataFrame: DataFrame containing erroneous cluster assignments according
        to the given benchmark, with columns "prediction" and "reference",
        sorted by predicted cluster and then by reference cluster.
    """
    data = pd.concat({"prediction": disambiguation, "reference": benchmark}, axis=1, join="inner")

    # Number of distinct reference clusters found inside each predicted cluster.
    ref_count = data.groupby("prediction")["reference"].nunique().rename("ref_count")
    counted = data.join(ref_count, on="prediction")

    # Sort on both keys at once: chaining two sort_values() calls relies on a
    # stable sort, which the default algorithm does not guarantee.
    clusters_to_split = (
        counted[counted["ref_count"] > 1]
        .sort_values(["prediction", "reference"])
        .drop("ref_count", axis=1)
    )

    if join_with is not None:
        table = clusters_to_split.join(join_with, rsuffix="_joined")
    else:
        table = clusters_to_split

    if links:
        table = add_links(table)

    return table
def inspect_clusters_to_merge(disambiguation, benchmark, join_with=None, links=False):
    """Get table to inspect missing cluster links given a benchmark dataset.

    Args:
        disambiguation (Series): disambiguation result Series (disambiguation
            results are pandas Series with "mention_id" index and cluster
            assignment values).
        benchmark (Series): reference disambiguation Series.
        join_with (DataFrame, optional): DataFrame indexed by "mention_id" whose
            columns are joined to the result. Defaults to None.
        links (bool, optional): Whether to add a "link" column using
            `add_links`. Defaults to False.

    Returns:
        DataFrame: DataFrame containing missing cluster links according to the
        given benchmark.
    """
    # Swap the roles of prediction and reference: a reference cluster that the
    # split inspection would "split" is one whose predicted clusters need merging.
    swapped = inspect_clusters_to_split(benchmark, disambiguation)
    # Restore the column labels to their true meaning after the swap.
    table = swapped.rename(columns={"prediction": "reference", "reference": "prediction"})

    if join_with is not None:
        table = table.join(join_with, rsuffix="_joined")

    return add_links(table) if links else table
def top_inventors(disambiguation, names, n=10):
    """Table of most prolific inventors.

    Args:
        disambiguation (Series): Membership vector, indexed by mention IDs,
            representing a given disambiguation.
        names (Series): Pandas Series indexed by mention IDs and with values
            corresponding to inventor name.
        n (int, optional): Number of clusters to keep. Defaults to 10.

    Returns:
        DataFrame: Table indexed by cluster ID, with the first name found for
        each of the n largest clusters.
    """
    # Cluster IDs of the n largest clusters.
    sizes = cluster_sizes(disambiguation)
    top_clusters = sizes.sort_values(ascending=False).head(n)

    # Keep only the mentions that belong to one of those clusters.
    in_top_cluster = np.isin(disambiguation.values, top_clusters.index.values)
    top_mentions = disambiguation[in_top_cluster]

    # Attach the inventor name to each mention, then keep one row per cluster.
    mentions_with_names = pd.merge(
        top_mentions,
        names,
        how="left",
        left_index=True,
        right_index=True,
    )
    return mentions_with_names.groupby(disambiguation.name).first()
def plot_entropy_curves(disambiguations):
    """Plot entropy curves for a set of disambiguations.

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing
            given disambiguations. Keys are used as trace names.

    Returns:
        Plotly figure.
    """
    curves = [plot_entropy_curve(membership, name=label) for label, membership in disambiguations.items()]
    fig = compare_plots(*curves)

    layout = dict(
        autosize=False,
        width=800,
        title="Hill numbers Curve",
        xaxis_title="q",
        yaxis_title="Hill number of order q",
    )
    fig.update_layout(**layout)
    fig.update_yaxes(autorange=True)
    return fig
def plot_cluster_sizes(disambiguations):
    """Plot cluster size distributions for a set of disambiguations.

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing
            given disambiguations. Keys are used as trace names.

    Returns:
        Plotly figure.
    """
    distributions = [
        plot_cluster_sizes_distribution(membership, name=label, normalize=True)
        for label, membership in disambiguations.items()
    ]
    fig = compare_plots(*distributions)

    layout = dict(
        autosize=False,
        width=800,
        title="Cluster Sizes Distribution",
        xaxis_title="Cluster size",
        yaxis_title="Proportion",
    )
    fig.update_layout(**layout)
    # Only cluster sizes from 0 to 20 are shown on the x axis.
    fig.update_xaxes(range=(0, 20))
    fig.update_yaxes(autorange=True)
    return fig
def plot_name_variation_rates(disambiguations, names):
    """Plot name variation rates for a set of given disambiguations.

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing
            given disambiguations.
        names (Series): Pandas Series indexed by mention IDs and with values
            corresponding to inventor name.

    Returns:
        Plotly figure.
    """
    rates = [name_variation_rate(membership, names=names) for membership in disambiguations.values()]

    # The constant "none" column puts all bars in a single x-axis group.
    columns = {
        "Name variation rate": rates,
        "Disambiguation": disambiguations.keys(),
        "none": "",
    }
    data = pd.DataFrame(columns)

    fig = px.bar(data, x="none", y="Name variation rate", color="Disambiguation", barmode="group")
    fig.update_layout(title="Name variation rate", xaxis_title="")
    fig.update_yaxes(range=(0, 1))
    return fig
def plot_homonimy_rates(disambiguations, names):
    """Plot homonymy rates for a set of given disambiguations.

    Args:
        disambiguations (Dict): Dictionary of membership vectors representing
            given disambiguations.
        names (Series): Pandas Series indexed by mention IDs and with values
            corresponding to inventor name.

    Returns:
        Plotly figure.
    """
    rates = [homonimy_rate(membership, names=names) for membership in disambiguations.values()]

    # The constant "none" column puts all bars in a single x-axis group.
    columns = {
        "Homonimy rate": rates,
        "Disambiguation": disambiguations.keys(),
        "none": "",
    }
    data = pd.DataFrame(columns)

    fig = px.bar(data, x="none", y="Homonimy rate", color="Disambiguation", barmode="group")
    fig.update_layout(title="Homonimy rate", xaxis_title="")
    fig.update_yaxes(range=(0, 1))
    return fig