diff --git a/statvar_imports/statistics_poland/README.md b/statvar_imports/statistics_poland/README.md index 5df3b7fe82..50c70ef17a 100644 --- a/statvar_imports/statistics_poland/README.md +++ b/statvar_imports/statistics_poland/README.md @@ -11,7 +11,7 @@ https://stat.gov.pl/en/databases/ The data comes from Poland's official statistical authority and includes comprehensive demographic variables such as population counts, age distributions, and other census-related metrics. ## How To Download Input Data -To download the data, you'll need to use the provided download script download_input_data.py. This script processes the statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx file to generate StatisticsPoland_input.csv inside a new "poland_input" folder. +To download the data, you'll need to use the provided download script download_input_data.py. This script reads the StatisticsPoland_input.csv template file from the GCS bucket path gs://datcom-prod-imports/statvar_imports/statistics_poland/poland_data_sample and generates StatisticsPoland_input_*.csv files inside a new "source_files" folder. type of place: State. 
@@ -29,18 +29,18 @@ To process the Poland Census data and generate statistical variables, use the fo **For Test Data Run** ```bash python3 tools/statvar_importer/stat_var_processor.py \ - --input_data='statvar_imports/statistics_poland/test/StatisticsPoland_input.csv' \ - --pv_map='statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv' \ - --output_path='statvar_imports/statistics_poland/test/StatisticsPoland_output' \ - --config_file='statvar_imports/statistics_poland/Statistics_Poland_metadata.csv' \ - --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf + --input_data=statvar_imports/statistics_poland/test/StatisticsPoland_input.csv \ + --pv_map=statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv \ + --output_path=statvar_imports/statistics_poland/test/StatisticsPoland_output \ + --config_file=statvar_imports/statistics_poland/StatisticsPoland_metadata.csv \ + --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf ``` **For Main data run** ```bash python3 tools/statvar_importer/stat_var_processor.py \ - --input_data='statvar_imports/statistics_poland/StatisticsPoland_input.csv' \ - --pv_map='statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv' \ - --output_path='statvar_imports/statistics_poland/StatisticsPoland_output' \ - --config_file='statvar_imports/statistics_poland/Statistics_Poland_metadata.csv' \ - --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf + --input_data='statvar_imports/statistics_poland/source_files/*.csv' \ + --pv_map=statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv \ + --output_path=statvar_imports/statistics_poland/StatisticsPoland_output \ + --config_file=statvar_imports/statistics_poland/StatisticsPoland_metadata.csv \ + --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf ``` diff --git a/statvar_imports/statistics_poland/Statistics_Poland_metadata.csv b/statvar_imports/statistics_poland/StatisticsPoland_metadata.csv 
similarity index 100% rename from statvar_imports/statistics_poland/Statistics_Poland_metadata.csv rename to statvar_imports/statistics_poland/StatisticsPoland_metadata.csv diff --git a/statvar_imports/statistics_poland/StatisticsPoland_output_stat_vars.mcf b/statvar_imports/statistics_poland/StatisticsPoland_output_stat_vars.mcf deleted file mode 100644 index aa303b0cd9..0000000000 --- a/statvar_imports/statistics_poland/StatisticsPoland_output_stat_vars.mcf +++ /dev/null @@ -1,827 +0,0 @@ -# Auto generated using command: "/usr/local/google/home/abhishekjaisw/Desktop/data/statvar_imports/statistics_poland/../../tools/statvar_importer/stat_var_processor.py --input_data=poland_input/StatisticsPoland_input.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=Statistics_Poland_metadata.csv --output_path=poland_output/StatisticsPoland_output" on 2026-01-29 13:18:54.678653 - -Node: dcid:Count_Person_Years0To2 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 - -Node: dcid:Count_Person_Years0To2_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -gender: dcid:Female - -Node: dcid:Count_Person_Years0To2_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years0To2_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years0To2_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -gender: dcid:Male - 
-Node: dcid:Count_Person_Years0To2_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years0To2_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years0To2_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years0To2_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years0To2 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years13To15 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 - -Node: dcid:Count_Person_Years13To15_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -gender: dcid:Female - -Node: dcid:Count_Person_Years13To15_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years13To15_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years13To15_Male -typeOf: dcid:StatisticalVariable -populationType: 
dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -gender: dcid:Male - -Node: dcid:Count_Person_Years13To15_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years13To15_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years13To15_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years13To15_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years13To15 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years16To19 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 - -Node: dcid:Count_Person_Years16To19_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -gender: dcid:Female - -Node: dcid:Count_Person_Years16To19_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years16To19_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -gender: dcid:Female 
-placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years16To19_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -gender: dcid:Male - -Node: dcid:Count_Person_Years16To19_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years16To19_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years16To19_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years16To19_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years16To19 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years20To24 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 - -Node: dcid:Count_Person_Years20To24_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -gender: dcid:Female - -Node: dcid:Count_Person_Years20To24_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years20To24_Female_Urban -typeOf: dcid:StatisticalVariable 
-populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years20To24_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -gender: dcid:Male - -Node: dcid:Count_Person_Years20To24_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years20To24_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years20To24_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years20To24_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years20To24 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years25To34 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 - -Node: dcid:Count_Person_Years25To34_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -gender: dcid:Female - -Node: dcid:Count_Person_Years25To34_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -gender: dcid:Female 
-placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years25To34_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years25To34_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -gender: dcid:Male - -Node: dcid:Count_Person_Years25To34_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years25To34_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years25To34_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years25To34_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years25To34 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years35To44 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 - -Node: dcid:Count_Person_Years35To44_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -gender: dcid:Female - -Node: dcid:Count_Person_Years35To44_Female_Rural -typeOf: dcid:StatisticalVariable 
-populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years35To44_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years35To44_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -gender: dcid:Male - -Node: dcid:Count_Person_Years35To44_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years35To44_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years35To44_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years35To44_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years35To44 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years3To6 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 - -Node: dcid:Count_Person_Years3To6_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: 
dcid:Years3To6 -gender: dcid:Female - -Node: dcid:Count_Person_Years3To6_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years3To6_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years3To6_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -gender: dcid:Male - -Node: dcid:Count_Person_Years3To6_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years3To6_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years3To6_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years3To6_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years3To6 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years45To54 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 - -Node: dcid:Count_Person_Years45To54_Female -typeOf: dcid:StatisticalVariable 
-populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -gender: dcid:Female - -Node: dcid:Count_Person_Years45To54_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years45To54_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years45To54_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -gender: dcid:Male - -Node: dcid:Count_Person_Years45To54_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years45To54_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years45To54_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years45To54_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years45To54 -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years55To64 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: 
dcid:measuredValue -age: dcid:Years55To64 - -Node: dcid:Count_Person_Years55To64_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -gender: dcid:Female - -Node: dcid:Count_Person_Years55To64_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years55To64_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years55To64_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -gender: dcid:Male - -Node: dcid:Count_Person_Years55To64_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years55To64_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years55To64_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years55To64_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years55To64 -placeOfResidenceClassification: dcid:Urban - -Node: 
dcid:Count_Person_Years65Onwards -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards - -Node: dcid:Count_Person_Years65Onwards_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -gender: dcid:Female - -Node: dcid:Count_Person_Years65Onwards_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years65Onwards_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years65Onwards_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -gender: dcid:Male - -Node: dcid:Count_Person_Years65Onwards_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years65Onwards_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years65Onwards_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years65Onwards_Urban -typeOf: 
dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years65Onwards -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years7To12 -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 - -Node: dcid:Count_Person_Years7To12_Female -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -gender: dcid:Female - -Node: dcid:Count_Person_Years7To12_Female_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -gender: dcid:Female -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years7To12_Female_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -gender: dcid:Female -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years7To12_Male -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -gender: dcid:Male - -Node: dcid:Count_Person_Years7To12_Male_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -gender: dcid:Male -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years7To12_Male_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -gender: dcid:Male -placeOfResidenceClassification: dcid:Urban - -Node: dcid:Count_Person_Years7To12_Rural -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: 
dcid:Years7To12 -placeOfResidenceClassification: dcid:Rural - -Node: dcid:Count_Person_Years7To12_Urban -typeOf: dcid:StatisticalVariable -populationType: dcid:Person -measuredProperty: dcid:count -statType: dcid:measuredValue -age: dcid:Years7To12 -placeOfResidenceClassification: dcid:Urban - diff --git a/statvar_imports/statistics_poland/download_input_data.py b/statvar_imports/statistics_poland/download_input_data.py index ee9135f854..147335540a 100644 --- a/statvar_imports/statistics_poland/download_input_data.py +++ b/statvar_imports/statistics_poland/download_input_data.py @@ -1,92 +1,275 @@ import pandas as pd import os import logging -import sys -import subprocess +import requests +import time from datetime import datetime +from google.cloud import storage +import io # Configure logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') # --- CONFIGURATION --- BASE_PATH = os.path.dirname(os.path.abspath(__file__)) -# Local temporary path for the downloaded Excel -LOCAL_EXCEL = os.path.join(BASE_PATH, "poland_raw.xlsx") -# Source GCS path -GCS_EXCEL_PATH = "gs://datcom-prod-imports/statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx" -# Final CSV in the root directory for the processor -OUTPUT_FILE = os.path.join(BASE_PATH, "StatisticsPoland_input.csv") - -TARGET_AGES = [ - "0-2", "3-6", "7-12", "13-15", "16-19", "20-24", - "25-34", "35-44", "45-54", "55-64", "65 i więcej" -] - -def download_from_gcs(): - """Downloads the raw excel file from GCS using gsutil.""" + +# GCS Template Path +GCS_TEMPLATE_PATH = "gs://datcom-prod-imports/statvar_imports/statistics_poland/poland_data_sample/StatisticsPoland_input.csv" + +# Local Output Directory +OUTPUT_DIR = os.path.join(BASE_PATH, "source_files") + +API_BASE_URL = "https://bdl.stat.gov.pl/api/v1" +API_KEY = "c9a9da02-47ab-4391-dff1-08de66e5ba7b" +HEADERS = {'X-ClientId': API_KEY} + +SUBJECT_ID = "P3447" + +SEX_STEMS = { + 'total': [], + 'males': ['męż'], + 'females': 
['kob'] +} +LOC_STEMS = { + 'total': [], + 'in urban areas': ['miast'], + 'in rural areas': ['wsi', 'wieś'] +} + +# AGE STEMS +AGE_STEMS = { + '0-2': '0-2', '3-6': '3-6', '7-12': '7-12', '13-15': '13-15', + '16-19': '16-19', '20-24': '20-24', '25-34': '25-34', '35-44': '35-44', + '45-54': '45-54', '55-64': '55-64', '65 and more': '65' +} + +def load_template_from_gcs(gcs_path): + """Loads the template CSV directly from GCS.""" try: - logging.info(f"Downloading source from {GCS_EXCEL_PATH}...") - subprocess.check_call(['gsutil', 'cp', GCS_EXCEL_PATH, LOCAL_EXCEL]) + logging.info(f"Reading template from {gcs_path}...") + + # Method 1: Direct Pandas Read (requires gcsfs) + # return pd.read_csv(gcs_path, header=[0,1,2,3], index_col=[0,1]) + + # Method 2: Google Cloud Storage Client (More robust if gcsfs isn't configured) + storage_client = storage.Client() + + # Parse bucket and blob + path_parts = gcs_path.replace("gs://", "").split("/", 1) + bucket_name = path_parts[0] + blob_name = path_parts[1] + + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(blob_name) + content = blob.download_as_text() + + return pd.read_csv(io.StringIO(content), header=[0,1,2,3], index_col=[0,1]) + except Exception as e: - logging.error(f"Failed to download from GCS: {e}") - # Note: If running locally and file exists, you might want to skip exit - if not os.path.exists(LOCAL_EXCEL): - sys.exit(1) + logging.error(f"Failed to load template from GCS: {e}") + return None -def process_poland_pivot(): - # 1. 
Fetch data from cloud - download_from_gcs() +def get_template_map(template_df): + """Maps Region Name -> Code (as String).""" + name_to_code = {} + for code, name in template_df.index: + clean_name = str(name).strip().upper() + name_to_code[clean_name] = str(code).strip() + return name_to_code - logging.info(f"Processing data from local copy: {LOCAL_EXCEL}") +def fetch_variables(): + """Fetches all variables for Subject P3447.""" + logging.info(f"Downloading variable list for Subject {SUBJECT_ID}...") + v_map = {} + + # Check pages 0-10 to ensure we get ALL variables + for page in range(10): + url = f"{API_BASE_URL}/variables?subject-id={SUBJECT_ID}&page-size=100&lang=pl&page={page}" + try: + resp = requests.get(url, headers=HEADERS, timeout=20) + if resp.status_code != 200: break + data = resp.json() + results = data.get('results', []) + if not results: break + + for item in results: + full_name_parts = [str(v) for k, v in item.items() if k.startswith('n') and v] + full_name = " ".join(full_name_parts).lower() + v_map[str(item['id'])] = full_name + + if len(results) < 100: break + except Exception as e: + logging.error(f"Metadata error page {page}: {e}") + break + + logging.info(f"Indexed {len(v_map)} variables.") + return v_map - try: - # 2. Load the 'DANE' sheet - df = pd.read_excel(LOCAL_EXCEL, sheet_name='DANE') - df.columns = ['Code', 'Name', 'Age', 'Sex', 'Location', 'Year', 'Value', 'Unit', 'Attr'] - - # 3. Filtering and Year Logic - df = df[df['Age'].isin(TARGET_AGES)] - current_year = datetime.now().year - available_years = sorted([y for y in df['Year'].unique() if y <= current_year]) - df = df[df['Year'].isin(available_years)] - - # 4. 
_COLUMN_LEVELS = ['Age', 'Sex', 'Location', 'Year']


def _is_total_age(age):
    """True when a template age label denotes the all-ages total (blank, NaN or 'total')."""
    return pd.isna(age) or str(age).strip() == '' or str(age).strip().lower() == 'total'


def _find_variable_id(v_metadata, target_age, sex_stems, loc_stems):
    """Return the id of the first variable whose name matches age, sex and location, else None."""
    import re  # local import: `re` is not known to be imported at file level

    # Whitespace inside the age token is ignored, as before ("0 - 2" matches
    # "0-2"), but the token must not be glued to other digits: a plain
    # substring test lets '0-2' match inside '20-24'.
    compact_age = re.sub(r"\s+", "", str(target_age))
    age_pattern = re.compile(
        r"(?<!\d)" + r"\s*".join(re.escape(ch) for ch in compact_age) + r"(?!\d)")

    for vid, vname in v_metadata.items():
        if not age_pattern.search(vname):
            continue

        # An empty stem list means "no qualifier": reject names that carry one.
        if sex_stems:
            if not any(stem in vname for stem in sex_stems):
                continue
        elif 'męż' in vname or 'kob' in vname:
            continue

        if loc_stems:
            if not any(stem in vname for stem in loc_stems):
                continue
        elif 'miast' in vname or 'wsi' in vname or 'wieś' in vname:
            continue

        return vid
    return None


def _match_region(raw_name, region_map):
    """Resolve an API unit name to (code, name); code is None when nothing matches."""
    api_name = raw_name.upper().strip()
    if api_name == "POLSKA":
        api_name = "POLAND"

    code = region_map.get(api_name)
    name = api_name
    if not code:
        # Fall back to the first template name contained in the API name.
        # Empty template names are skipped because '' is contained in
        # every string and would match any region.
        for t_name, t_code in region_map.items():
            if t_name and t_name in api_name:
                code, name = t_code, t_name
                break
    return code, name


def _download_variable_rows(var_id, age, sex, loc, region_map, years):
    """Download one variable for unit levels "0" and "2" and return its rows as dicts."""
    rows = []
    api_url = f"{API_BASE_URL}/data/by-variable/{var_id}"

    # NOTE(review): levels "0" and "2" are presumably country and
    # voivodeship in the API - confirm.  Only the first page of 100 units
    # is read per level.
    for unit_level in ("0", "2"):
        params = [('unit-level', unit_level), ('page-size', '100')]
        params.extend(('year', str(y)) for y in years)

        try:
            resp = requests.get(api_url, headers=HEADERS, params=params, timeout=20)
            if resp.status_code != 200:
                logging.warning(
                    f"Variable {var_id} level {unit_level}: HTTP {resp.status_code}, skipped.")
                continue
            results = resp.json().get('results', [])
            if not results:
                continue

            # The field holding the unit name is detected from the first result.
            name_key = next((k for k in ('name', 'n', 'unitName') if k in results[0]), None)
            if not name_key:
                continue

            for res in results:
                code, name = _match_region(res[name_key], region_map)
                if code is None:
                    continue
                for val in res['values']:
                    rows.append({
                        'Code': str(code),
                        'Name': name,
                        'Year': str(val['year']),
                        'Value': val['val'],
                        'Age': age, 'Sex': sex, 'Location': loc
                    })
        except Exception as e:
            # Best-effort: rows gathered before the failure are kept.
            logging.error(f"Download Error on {var_id}: {e}")
        finally:
            time.sleep(0.05)  # small pause between API requests
    return rows


def _build_year_frame(year_df, template_df, year):
    """Pivot one year of rows, add all-ages totals and align to the template layout."""
    pivot_df = year_df.pivot_table(
        index=['Code', 'Name'],
        columns=_COLUMN_LEVELS,
        values='Value'
    )

    # Sum the age bands per (Sex, Location, Year).  The frame is transposed
    # because groupby(axis=1) is deprecated in pandas; min_count=1 keeps a
    # region with no data as NaN instead of a fabricated 0.
    # NOTE(review): a total only covers the age bands that were actually
    # downloaded; skipped bands make it an undercount.
    totals = pivot_df.T.groupby(level=['Sex', 'Location', 'Year']).sum(min_count=1).T
    totals.columns = pd.MultiIndex.from_tuples(
        [('total', s, l, y) for s, l, y in totals.columns],
        names=_COLUMN_LEVELS
    )
    combined_df = pd.concat([pivot_df, totals], axis=1)

    # One output column per distinct (Age, Sex, Location) of the template, in
    # template order.  The template may list each combination once per year;
    # without de-duplication every per-year file would repeat its columns.
    header_by_key = {}
    for t_age, t_sex, t_loc, _ in template_df.columns:
        lookup_age = 'total' if _is_total_age(t_age) else t_age
        header_by_key.setdefault((lookup_age, t_sex, t_loc), (t_age, t_sex, t_loc, str(year)))

    target_columns = [key + (str(year),) for key in header_by_key]
    missing = [col for col in target_columns if col not in combined_df.columns]
    if missing:
        logging.warning(f"Column alignment warning for {year}: no data for {missing}")

    # reindex never raises on missing labels, so every file keeps the
    # template's rows and columns; gaps are left empty.
    final_df = combined_df.reindex(
        index=template_df.index,
        columns=pd.MultiIndex.from_tuples(target_columns, names=_COLUMN_LEVELS)
    )

    # Restore original headers (e.g. putting back empty strings for Total Age)
    final_df.columns = pd.MultiIndex.from_tuples(
        list(header_by_key.values()), names=_COLUMN_LEVELS)
    return final_df


def download_and_process():
    """Download the population data and write one template-shaped CSV per year.

    Loads the template from GCS_TEMPLATE_PATH, resolves an API variable for
    each non-total (Age, Sex, Location) column of the template, downloads its
    values, computes the all-ages totals and writes
    OUTPUT_DIR/StatisticsPoland_input_<year>.csv for every year that returned
    data.  Returns None; returns early (after logging) when the template, the
    variable list or the downloaded data is empty.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 1. LOAD TEMPLATE FROM GCS
    template_df = load_template_from_gcs(GCS_TEMPLATE_PATH)
    if template_df is None:
        return

    # Force index to strings
    template_df.index = template_df.index.set_levels([
        template_df.index.levels[0].astype(str),
        template_df.index.levels[1].astype(str)
    ])

    region_map = get_template_map(template_df)
    v_metadata = fetch_variables()
    if not v_metadata:
        return

    # The request also asks for next year, as the original did.
    years = range(2003, datetime.now().year + 2)

    # 2. MATCH & DOWNLOAD (Specific Ages Only)
    master_data = []
    for age, sex, loc in template_df.columns.droplevel('Year').unique():
        # Totals are not downloaded; they are computed from the age bands.
        if _is_total_age(age):
            continue

        var_id = _find_variable_id(
            v_metadata, AGE_STEMS.get(age, age), SEX_STEMS[sex], LOC_STEMS[loc])
        if not var_id:
            logging.warning(f"SKIPPING: {age}|{sex}|{loc}")
            continue

        logging.info(f"MATCH: {age}|{sex}|{loc} -> ID {var_id}")
        master_data.extend(
            _download_variable_rows(var_id, age, sex, loc, region_map, years))

    if not master_data:
        logging.error("No data collected.")
        return

    # 3. PROCESS, CALCULATE TOTALS & 4. REINDEX AGAINST TEMPLATE
    full_df = pd.DataFrame(master_data)
    for year in sorted(full_df['Year'].unique()):
        final_df = _build_year_frame(full_df[full_df['Year'] == year], template_df, year)

        out_path = os.path.join(OUTPUT_DIR, f"StatisticsPoland_input_{year}.csv")
        final_df.to_csv(out_path)
        logging.info(f"Generated: {out_path}")


if __name__ == "__main__":
    download_and_process()
"../../tools/statvar_importer/stat_var_processor.py --input_data=source_files/*.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=StatisticsPoland_metadata.csv --output_path=StatisticsPoland_output" ], "source_files": [ - "StatisticsPoland_input.csv" + "source_files/*.csv" ], "import_inputs": [ { diff --git a/statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx b/statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx deleted file mode 100644 index caed5eaa2e..0000000000 Binary files a/statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx and /dev/null differ diff --git a/statvar_imports/statistics_poland/test/StatisticsPoland_output_stat_vars.mcf b/statvar_imports/statistics_poland/test/StatisticsPoland_output_stat_vars.mcf index 46669712c1..e2037e9852 100644 --- a/statvar_imports/statistics_poland/test/StatisticsPoland_output_stat_vars.mcf +++ b/statvar_imports/statistics_poland/test/StatisticsPoland_output_stat_vars.mcf @@ -1,4 +1,4 @@ -# Auto generated using command: "tools/statvar_importer/stat_var_processor.py --input_data=statvar_imports/statistics_poland/test/StatisticsPoland_input.csv --pv_map=statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv --output_path=statvar_imports/statistics_poland/test/StatisticsPoland_output --config_file=statvar_imports/statistics_poland/Statistics_Poland_metadata.csv --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf" on 2026-01-28 18:01:16.474072 +# Auto generated using command: "tools/statvar_importer/stat_var_processor.py --input_data=statvar_imports/statistics_poland/test/StatisticsPoland_input.csv --pv_map=statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv --output_path=statvar_imports/statistics_poland/test/StatisticsPoland_output --config_file=statvar_imports/statistics_poland/StatisticsPoland_metadata.csv --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf" on 2026-02-09 15:52:21.072860 Node: 
dcid:Count_Person_Years0To2 typeOf: dcid:StatisticalVariable