carter swartout

github posts notes

Soccer results based on possession


plot of possession vs win/loss/draw ratios

Was chatting with a friend about how possession impacts results in soccer and hacked up a quick plot. Data is sourced from Kaggle.

There’s a clear increase in win-percentage as possession goes up, which would be expected, however draws stay somewhat constant.

- Carter


Code:

from pprint import pprint
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from collections import defaultdict
import numpy as np
import sqlite3
import pandas as pd

conn = sqlite3.connect("data/database.sqlite")
df = pd.read_sql_query("SELECT * FROM Match;", conn)
conn.close()

results = defaultdict(lambda: {'win': 0, 'loss': 0, 'draw': 0})

for _, row in df.iterrows():
    if not row['possession']:
        continue
    for value in ET.fromstring(row['possession']).findall("value"):
        if value.findtext("elapsed") == '90':
            home_pos, away_pos = value.findtext("homepos"), value.findtext("awaypos")
            if not home_pos or not away_pos:
                continue
            home_goals, away_goals = row['home_team_goal'], row['away_team_goal']
            if home_goals > away_goals:
                results[home_pos]['win'] += 1
                results[away_pos]['loss'] += 1
            elif home_goals < away_goals:
                results[home_pos]['loss'] += 1
                results[away_pos]['win'] += 1
            else:
                results[home_pos]['draw'] += 1
                results[away_pos]['draw'] += 1

df = pd.DataFrame.from_dict(results, orient="index").astype(float)
df.index = df.index.astype(int)

full_index = pd.Index(range(20, 81), name="possession")
df = df.reindex(full_index)

df["total_games"] = df.sum(axis=1)
valid_mask = df["total_games"] >= 10
df_ratio = df.div(df[["win", "loss", "draw"]].sum(axis=1), axis=0)
df_ratio[~valid_mask] = np.nan

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True, gridspec_kw={"height_ratios": [3, 1]})

x_positions = np.arange(len(df.index))

order = ["win", "draw", "loss"]
colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]
bottom = np.zeros(len(df.index))

for i, col in enumerate(order):
    values = df_ratio[col].fillna(0).values
    ax1.bar(x_positions, values, bottom=bottom, width=1.0, label=col, color=colors[i])
    bottom += values

no_data_mask = df_ratio.isna().all(axis=1)
if no_data_mask.any():
    ax1.bar(x_positions[no_data_mask], [1]*no_data_mask.sum(), 
            color="lightgray", width=1.0, label="<10 games")

ax1.set_ylabel("Ratio")
ax1.set_title("Win / Loss / Draw Ratios by Possession (20–80%, only ≥10 games)")
ax1.legend(title="Result")

tick_positions = [i for i, val in enumerate(df.index) if val % 5 == 0]
tick_labels = [val for val in df.index if val % 5 == 0]
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(tick_labels)
ax1.set_xlim(-0.5, len(df.index) - 0.5)
ax1.tick_params(labelbottom=True)

ax2.bar(x_positions, df["total_games"].fillna(0), color="steelblue", width=1.0)
ax2.set_ylabel("# Games")
ax2.set_xlabel("Possession (%)")
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(tick_labels)
ax2.set_xlim(-0.5, len(df.index) - 0.5)

plt.tight_layout()
plt.show()