# stackoverflow2025/processing.py at main · JavierPachas/stackoverflow2025 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from pathlib import Path
import numpy as np
import pandas as pd

# Default location of the survey CSV; used as the default `path` argument of
# load_survey_data.
DATA_PATH = Path("data/survey_results_public.csv")
# (low, high) bounds applied to the Salary column in load_survey_data; rows
# whose salary falls outside this range are dropped.
# NOTE(review): the currency/unit of these bounds is not visible here — confirm.
SALARY_RANGE = (12000, 500000)

# Long-form country names mapped to the shorter labels that normalize_inputs
# substitutes into the Country column. Names not listed here are left as-is.
COUNTRY_RENAMES = {
    "United States of America": "USA",
    "United Kingdom of Great Britain and Northern Ireland": "UK/NorthIreland",
    "Iran, Islamic Republic of...": "Iran",
    "Venezuela, Bolivarian Republic of...": "Venezuela",
    "Hong Kong (S.A.R.)": "Hong Kong",
}


def clean_education(value):
    """Collapse a raw education answer into one of four coarse labels.

    The answer is matched by substring, in priority order: "Bachelor" ->
    "Bachelors", "Master" -> "Masters", "Professional degree" or "Doctoral"
    -> "PostGrad". Anything else, including a missing value, becomes
    "LessThanBachelors".
    """
    label = "" if pd.isna(value) else str(value)

    # Ordered (keywords, bucket) pairs; the first bucket with a matching
    # keyword wins, so "Bachelor" takes precedence over "Master", and so on.
    buckets = (
        (("Bachelor",), "Bachelors"),
        (("Master",), "Masters"),
        (("Professional degree", "Doctoral"), "PostGrad"),
    )
    for keywords, bucket in buckets:
        if any(keyword in label for keyword in keywords):
            return bucket
    return "LessThanBachelors"


def clean_years(value):
    """Turn a survey experience answer into a float number of years.

    Missing values yield NaN. Python ints/floats are returned as floats.
    Strings are stripped, then "Less than 1 year" maps to 0.5 and
    "More than 50 years" maps to 50.0; any other text is parsed with
    float(), falling back to NaN when it is not a number.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)

    answer = str(value).strip()

    # The two non-numeric answers that have a fixed numeric stand-in.
    special_answers = {
        "Less than 1 year": 0.5,
        "More than 50 years": 50.0,
    }
    if answer in special_answers:
        return special_answers[answer]

    try:
        return float(answer)
    except ValueError:
        return np.nan


def normalize_inputs(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *frame*; the input is not modified.

    Expects "Country", "EdLevel" and "YearsCode" columns:
    - Country: names listed in COUNTRY_RENAMES are replaced by their short form.
    - EdLevel: each value is passed through clean_education.
    - YearsCode: each value is passed through clean_years, then limited to
      the range 0..50.
    """
    cleaned = frame.copy()

    countries = cleaned["Country"].replace(COUNTRY_RENAMES)
    education = cleaned["EdLevel"].apply(clean_education)
    # Clip after parsing so out-of-range numeric answers land on the bounds.
    years = cleaned["YearsCode"].apply(clean_years).clip(lower=0, upper=50)

    cleaned["Country"] = countries
    cleaned["EdLevel"] = education
    cleaned["YearsCode"] = years
    return cleaned


def load_survey_data(path: Path = DATA_PATH, cutoff: int = 100) -> pd.DataFrame:
    """
    Read the survey CSV and return the filtered, cleaned responses.

    Steps, in order:
    1. Read only the Country, EdLevel, YearsCode, Employment and
       ConvertedCompYearly columns; rename ConvertedCompYearly to Salary.
    2. Keep rows whose Employment is exactly "Employed", then drop that column.
    3. Drop rows without a Salary.
    4. Run normalize_inputs and drop rows with any remaining missing value.
    5. Keep rows whose YearsCode lies between 0 and 50.
    6. Keep countries that have at least `cutoff` rows at this point.
    7. Keep rows whose Salary lies within SALARY_RANGE.

    The country counts are taken before the salary filter, so a returned
    country may end up with fewer than `cutoff` rows.
    """
    wanted_columns = ["Country", "EdLevel", "YearsCode", "Employment", "ConvertedCompYearly"]
    responses = pd.read_csv(path, usecols=wanted_columns, low_memory=False)
    responses = responses.rename(columns={"ConvertedCompYearly": "Salary"})

    is_employed = responses["Employment"] == "Employed"
    responses = responses.loc[is_employed].drop(columns=["Employment"])
    responses = responses.loc[responses["Salary"].notna()]

    responses = normalize_inputs(responses).dropna()
    responses = responses.loc[responses["YearsCode"].between(0, 50)]

    # Only countries with enough responses are retained.
    rows_per_country = responses["Country"].value_counts()
    frequent_countries = rows_per_country.loc[rows_per_country >= cutoff].index
    responses = responses.loc[responses["Country"].isin(frequent_countries)]

    salary_ok = responses["Salary"].between(*SALARY_RANGE)
    return responses.loc[salary_ok]