-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing.py
More file actions
84 lines (69 loc) · 2.56 KB
/
processing.py
File metadata and controls
84 lines (69 loc) · 2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from pathlib import Path
import numpy as np
import pandas as pd
# Default location of the survey CSV, relative to the working directory.
DATA_PATH = Path("data/survey_results_public.csv")

# (low, high) salary bounds; rows outside this range are dropped on load.
SALARY_RANGE = (12_000, 500_000)

# Long official country names mapped to the short labels used after cleaning.
COUNTRY_RENAMES = {
    "United States of America": "USA",
    "United Kingdom of Great Britain and Northern Ireland": "UK/NorthIreland",
    "Iran, Islamic Republic of...": "Iran",
    "Venezuela, Bolivarian Republic of...": "Venezuela",
    "Hong Kong (S.A.R.)": "Hong Kong",
}
def clean_education(value):
    """Normalize education strings to a small set of labels.

    Matching is case-insensitive so that answers such as
    "Other doctoral degree (Ph.D., Ed.D., etc.)" are recognised as
    postgraduate instead of falling through to the default label.

    Args:
        value: Raw education answer. Anything ``pd.notna`` reports as
            missing is treated as an empty answer.

    Returns:
        One of "Bachelors", "Masters", "PostGrad" or "LessThanBachelors".
    """
    text = str(value).casefold() if pd.notna(value) else ""
    # Checks run in priority order; the first match decides the label.
    if "bachelor" in text:
        return "Bachelors"
    if "master" in text:
        return "Masters"
    if "professional degree" in text or "doctoral" in text:
        return "PostGrad"
    return "LessThanBachelors"
def clean_years(value):
    """Turn a survey experience answer into a float number of years.

    Missing values and unparseable text yield NaN; plain ``int``/``float``
    inputs are returned as floats; the two textual answers are mapped to
    fixed values; any other input is stringified, stripped and parsed.
    """
    if pd.isna(value):
        return np.nan

    if isinstance(value, (int, float)):
        return float(value)

    # Textual answers that stand for a bound rather than a number.
    textual_answers = {
        "Less than 1 year": 0.5,
        "More than 50 years": 50.0,
    }
    answer = str(value).strip()
    if answer in textual_answers:
        return textual_answers[answer]

    try:
        return float(answer)
    except ValueError:
        return np.nan
def normalize_inputs(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *frame*; the input is left unmodified.

    Country names are shortened via COUNTRY_RENAMES, EdLevel is collapsed
    with clean_education, and YearsCode is converted with clean_years and
    then clipped to the range 0-50.
    """
    return frame.assign(
        Country=frame["Country"].replace(COUNTRY_RENAMES),
        EdLevel=frame["EdLevel"].apply(clean_education),
        YearsCode=frame["YearsCode"].apply(clean_years).clip(lower=0, upper=50),
    )
def load_survey_data(path: Path = DATA_PATH, cutoff: int = 100) -> pd.DataFrame:
    """
    Read the Stack Overflow survey CSV and return the filtered responses.

    Steps, in order: keep rows whose Employment is exactly "Employed" and
    whose salary is present, run normalize_inputs, drop rows with any missing
    value, keep YearsCode within 0-50, keep countries with at least *cutoff*
    remaining rows, and finally keep salaries inside SALARY_RANGE.
    """
    wanted_columns = ["Country", "EdLevel", "YearsCode", "Employment", "ConvertedCompYearly"]
    raw = pd.read_csv(path, usecols=wanted_columns, low_memory=False)
    raw = raw.rename(columns={"ConvertedCompYearly": "Salary"})

    # Employment is only needed for this filter, so it is dropped right after.
    employed = raw.loc[raw["Employment"] == "Employed"].drop(columns=["Employment"])
    with_salary = employed.loc[employed["Salary"].notna()]

    cleaned = normalize_inputs(with_salary).dropna()
    cleaned = cleaned.loc[cleaned["YearsCode"].between(0, 50)]

    # Country frequencies are counted before the salary filter is applied.
    per_country = cleaned["Country"].value_counts()
    frequent = per_country[per_country >= cutoff].index
    cleaned = cleaned.loc[cleaned["Country"].isin(frequent)]

    salary_min, salary_max = SALARY_RANGE
    return cleaned.loc[cleaned["Salary"].between(salary_min, salary_max)]