This commit is contained in:
2025-05-31 10:58:30 -03:00
commit 1cb7645910
48 changed files with 2235 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
using CsvHelper.Configuration;
using OpenCand.Parser.Models;
using System.Globalization;
namespace OpenCand.Parser.CsvMappers
{
public class BemCandidatoMap : ClassMap<BemCandidatoCSV>
{
public BemCandidatoMap()
{
AutoMap(CultureInfo.InvariantCulture);
// Explicitly handle any special mappings if needed
}
}
}

View File

@@ -0,0 +1,15 @@
using CsvHelper.Configuration;
using OpenCand.Parser.Models;
using System.Globalization;
namespace OpenCand.Parser.CsvMappers
{
public class CandidatoMap : ClassMap<CandidatoCSV>
{
public CandidatoMap()
{
AutoMap(CultureInfo.InvariantCulture);
// Explicitly handle any special mappings if needed
}
}
}

View File

@@ -0,0 +1,14 @@
using System.Globalization;
using CsvHelper.Configuration;
using OpenCand.Parser.Models;
namespace OpenCand.ETL.Parser.CsvMappers
{
public class RedeSocialMap : ClassMap<BemCandidatoCSV>
{
public RedeSocialMap()
{
AutoMap(CultureInfo.InvariantCulture);
}
}
}

View File

@@ -0,0 +1,58 @@
using CsvHelper.Configuration.Attributes;
namespace OpenCand.Parser.Models
{
public class BemCandidatoCSV
{
[Name("DT_GERACAO")]
public string DataGeracao { get; set; }
[Name("HH_GERACAO")]
public string HoraGeracao { get; set; }
[Name("ANO_ELEICAO")]
public int AnoEleicao { get; set; }
[Name("CD_TIPO_ELEICAO")]
public int CodigoTipoEleicao { get; set; }
[Name("NM_TIPO_ELEICAO")]
public string NomeTipoEleicao { get; set; }
[Name("CD_ELEICAO")]
public int CodigoEleicao { get; set; }
[Name("DS_ELEICAO")]
public string DescricaoEleicao { get; set; }
[Name("DT_ELEICAO")]
public string DataEleicao { get; set; }
[Name("SG_UF")]
public string SiglaUF { get; set; }
[Name("SG_UE")]
public string SiglaUE { get; set; }
[Name("NM_UE")]
public string NomeUE { get; set; }
[Name("SQ_CANDIDATO")]
public string SequencialCandidato { get; set; }
[Name("NR_ORDEM_BEM_CANDIDATO")]
public int NumeroOrdemBemCandidato { get; set; }
[Name("CD_TIPO_BEM_CANDIDATO")]
public int CodigoTipoBemCandidato { get; set; }
[Name("DS_TIPO_BEM_CANDIDATO")]
public string DescricaoTipoBemCandidato { get; set; }
[Name("DS_BEM_CANDIDATO")]
public string DescricaoBemCandidato { get; set; }
[Name("VR_BEM_CANDIDATO")]
public string ValorBemCandidato { get; set; }
}
}

View File

@@ -0,0 +1,95 @@
using System;
using CsvHelper.Configuration.Attributes;
namespace OpenCand.Parser.Models
{
public class CandidatoCSV
{
[Name("DT_GERACAO")]
public string DataGeracao { get; set; }
[Name("HH_GERACAO")]
public string HoraGeracao { get; set; }
[Name("ANO_ELEICAO")]
public int AnoEleicao { get; set; }
[Name("CD_TIPO_ELEICAO")]
public int CodigoTipoEleicao { get; set; }
[Name("NM_TIPO_ELEICAO")]
public string NomeTipoEleicao { get; set; }
[Name("NR_TURNO")]
public int NumeroTurno { get; set; }
[Name("CD_ELEICAO")]
public int CodigoEleicao { get; set; }
[Name("DS_ELEICAO")]
public string DescricaoEleicao { get; set; }
[Name("DT_ELEICAO")]
public string DataEleicao { get; set; }
[Name("TP_ABRANGENCIA")]
public string TipoAbrangencia { get; set; }
[Name("SG_UF")]
public string SiglaUF { get; set; }
[Name("SG_UE")]
public string SiglaUE { get; set; }
[Name("NM_UE")]
public string NomeUE { get; set; }
[Name("CD_CARGO")]
public int CodigoCargo { get; set; }
[Name("DS_CARGO")]
public string DescricaoCargo { get; set; }
[Name("SQ_CANDIDATO")]
public string SequencialCandidato { get; set; }
[Name("NR_CANDIDATO")]
public string NumeroCandidato { get; set; }
[Name("NM_CANDIDATO")]
public string NomeCandidato { get; set; }
[Name("NM_URNA_CANDIDATO")]
public string NomeUrnaCandidato { get; set; }
[Name("NM_SOCIAL_CANDIDATO")]
public string NomeSocialCandidato { get; set; }
[Name("NR_CPF_CANDIDATO")]
public string CPFCandidato { get; set; }
[Name("DS_EMAIL", "NM_EMAIL")]
public string Email { get; set; }
[Name("SG_UF_NASCIMENTO")]
public string SiglaUFNascimento { get; set; }
[Name("DT_NASCIMENTO")]
public string DataNascimento { get; set; }
[Name("DS_GENERO")]
public string Genero { get; set; }
[Name("DS_OCUPACAO")]
public string Ocupacao { get; set; }
[Name("DS_ESTADO_CIVIL")]
public string EstadoCivil { get; set; }
[Name("DS_GRAU_INSTRUCAO")]
public string GrauInstrucao { get; set; }
[Name("DS_SIT_TOT_TURNO")]
public string SituacaoTurno { get; set; }
}
}

View File

@@ -0,0 +1,19 @@
using CsvHelper.Configuration.Attributes;
namespace OpenCand.Parser.Models
{
public class RedeSocialCSV
{
[Name("AA_ELEICAO")]
public int DataEleicao { get; set; }
[Name("SG_UF")]
public string SiglaUF { get; set; }
[Name("SQ_CANDIDATO")]
public string SequencialCandidato { get; set; }
[Name("DS_URL")]
public string Url { get; set; }
}
}

View File

@@ -0,0 +1,118 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using OpenCand.Config;
using OpenCand.Parser.Services;
namespace OpenCand.Parser
{
public class ParserManager
{
private readonly CsvParserService csvParserService;
private readonly ILogger<ParserManager> logger;
private readonly CsvSettings csvSettings;
private readonly IConfiguration configuration;
public ParserManager(
CsvParserService csvParserService,
IOptions<CsvSettings> csvSettings,
ILogger<ParserManager> logger,
IConfiguration configuration)
{
this.csvParserService = csvParserService;
this.logger = logger;
this.csvSettings = csvSettings.Value;
this.configuration = configuration;
}
public async Task ParseFullDataAsync()
{
logger.LogInformation("ParseFullDataAsync - Starting parsing");
// Get the base path from either SampleFolder in csvSettings or the BasePath in configuration
var basePath = configuration.GetValue<string>("BasePath");
if (string.IsNullOrEmpty(basePath))
{
logger.LogError("ParseFullDataAsync - BasePath is not configured in appsettings.json or CsvSettings.SampleFolder");
return;
}
logger.LogInformation("ParseFullDataAsync - Processing will happen with BasePath: {BasePath}", basePath);
try
{
var candidatosDirectory = Path.Combine(basePath, csvSettings.CandidatosFolder);
var bensCandidatosDirectory = Path.Combine(basePath, csvSettings.BensCandidatosFolder);
var redesSociaisDirectory = Path.Combine(basePath, csvSettings.RedesSociaisFolder);
if (Directory.Exists(candidatosDirectory))
{
foreach (var filePath in Directory.GetFiles(candidatosDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFullDataAsync - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFullDataAsync - Parsing candidatos data from {FilePath}", filePath);
await csvParserService.ParseCandidatosAsync(filePath);
}
}
else
{
logger.LogWarning("ParseFullDataAsync - 'Candidatos' directory not found at {Directory}", candidatosDirectory);
}
if (Directory.Exists(bensCandidatosDirectory))
{
foreach (var filePath in Directory.GetFiles(bensCandidatosDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFullDataAsync - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFullDataAsync - Parsing bens candidatos data from {FilePath}", filePath);
await csvParserService.ParseBensCandidatosAsync(filePath);
}
}
else
{
logger.LogWarning("ParseFullDataAsync - 'Bens candidatos' directory not found at {Directory}", bensCandidatosDirectory);
}
if (Directory.Exists(redesSociaisDirectory))
{
foreach (var filePath in Directory.GetFiles(redesSociaisDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFullDataAsync - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFullDataAsync - Parsing redes sociais data from {FilePath}", filePath);
await csvParserService.ParseRedeSocialAsync(filePath);
}
}
else
{
logger.LogWarning("ParseFullDataAsync - 'Redes sociais' directory not found at {Directory}", redesSociaisDirectory);
}
logger.LogInformation("ParseFullDataAsync - Full data parsing completed!");
}
catch (Exception ex)
{
logger.LogError(ex, "ParseFullDataAsync - Error parsing full data set");
throw;
}
}
}
}

View File

@@ -0,0 +1,119 @@
using System.Text;
using Microsoft.Extensions.Logging;
using OpenCand.Repository;
namespace OpenCand.Parser.Services
{
public class CsvFixerService
{
private readonly ILogger<CsvParserService> logger;
public CsvFixerService(
ILogger<CsvParserService> logger)
{
this.logger = logger;
}
public string FixCsvFile(string filePath)
{
var filename = Path.GetFileName(filePath);
var path = Path.GetDirectoryName(filePath);
// Check if the file exists
if (!File.Exists(filePath))
{
logger.LogError($"FixCsvFile - The file at '{filePath}' does not exist");
return string.Empty;
}
if (string.IsNullOrEmpty(filename) || string.IsNullOrEmpty(path))
{
logger.LogError($"FixCsvFile - The file path '{filePath}' is invalid");
return string.Empty;
}
// Fixed file will have the same name but with "fix_" prefix
var newFilePath = Path.Combine(path, $"fix_{filename}");
if (File.Exists(newFilePath))
{
logger.LogWarning($"FixCsvFile - A fixed file already exists at '{newFilePath}'. It will be overwritten.");
}
logger.LogInformation($"FixCsvFile - Starting to fix CSV file at '{filePath}'");
try
{
// Read the file
var lines = File.ReadAllLines(filePath, encoding: Encoding.GetEncoding(1252));
if (lines.Length == 0)
{
logger.LogError($"FixCsvFile - The file at '{filePath}' is empty");
return string.Empty;
}
var newLines = new List<string>();
var headerCount = lines[0].Split(';').Length;
if (headerCount == 0)
{
logger.LogError($"FixCsvFile - The first line of the file at '{filePath}' does not contain any headers");
return string.Empty;
}
logger.LogInformation($"FixCsvFile - Detected {headerCount} headers in the CSV file");
for (int i = 0; i < lines.Length;)
{
var line = lines[i];
var columns = line.Split(';');
var lineJump = 1;
while (columns.Length != headerCount)
{
if (columns.Length > headerCount)
{
logger.LogCritical($"FixCsvFile - Line {i + 1} has {columns.Length} columns, expected {headerCount}. Halting process.");
return string.Empty; // Critical error, cannot fix this line => needs manual intervention
}
logger.LogWarning($"FixCsvFile - Line {i + 1} has {columns.Length} columns, expected {headerCount}. Attempting to fix [i = {lineJump}]...");
// Likely the "original line" had some \n that were processed incorrectly
// Append lines[i + 1] to the current line and re-do the check
if (i + lineJump >= lines.Length)
{
logger.LogCritical($"FixCsvFile - Reached the end of the file while trying to fix line {i + 1}. Cannot continue.");
return string.Empty; // Cannot fix this line, reached the end of the file
}
// Append the next line to the current line
line += lines[i + lineJump];
// Re-split the line to check the number of columns again
columns = line.Split(';');
// increment lineJump
lineJump++;
}
newLines.Add(line);
i += lineJump;
}
// Write the fixed lines to the new filepath
File.WriteAllLines(newFilePath, newLines, Encoding.UTF8);
logger.LogInformation($"FixCsvFile - Successfully fixed CSV file at {newFilePath}");
return newFilePath;
}
catch (Exception ex)
{
logger.LogError(ex, $"FixCsvFile - Error fixing CSV file at {filePath}");
return string.Empty;
}
}
}
}

View File

@@ -0,0 +1,268 @@
using System.Globalization;
using CsvHelper;
using CsvHelper.Configuration;
using Microsoft.Extensions.Logging;
using OpenCand.Core.Models;
using OpenCand.ETL.Parser.CsvMappers;
using OpenCand.Parser.CsvMappers;
using OpenCand.Parser.Models;
using OpenCand.Services;
namespace OpenCand.Parser.Services
{
public class CsvParserService
{
private readonly ILogger<CsvParserService> logger;
private readonly CandidatoService candidatoService;
private readonly BemCandidatoService bemCandidatoService;
private readonly RedeSocialService redeSocialService;
private readonly CsvFixerService csvFixerService;
private readonly CsvConfiguration parserConfig;
public CsvParserService(
ILogger<CsvParserService> logger,
CandidatoService candidatoService,
BemCandidatoService bemCandidatoService,
RedeSocialService redeSocialService,
CsvFixerService csvFixerService)
{
this.logger = logger;
this.candidatoService = candidatoService;
this.bemCandidatoService = bemCandidatoService;
this.redeSocialService = redeSocialService;
this.csvFixerService = csvFixerService;
parserConfig = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";",
HasHeaderRecord = true,
PrepareHeaderForMatch = args => args.Header.ToLower(),
MissingFieldFound = null,
TrimOptions = TrimOptions.Trim,
Encoding = System.Text.Encoding.UTF8
};
}
public async Task ParseCandidatosAsync(string filePath)
{
logger.LogInformation($"ParseCandidatosAsync - Starting to parse 'candidatos' from '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseCandidatosAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
var po = new ParallelOptions
{
MaxDegreeOfParallelism = 100
};
csv.Context.RegisterClassMap<CandidatoMap>();
var records = csv.GetRecords<CandidatoCSV>();
await Parallel.ForEachAsync(records, po, async (record, ct) =>
{
try
{
if (string.IsNullOrWhiteSpace(record.CPFCandidato) || record.CPFCandidato.Length <= 3)
{
record.CPFCandidato = null; // Handle null/empty/whitespace CPF
}
if (record.NomeCandidato == "NÃO DIVULGÁVEL" ||
string.IsNullOrEmpty(record.NomeCandidato) ||
record.NomeCandidato == "#NULO")
{
logger.LogCritical($"ParseCandidatosAsync - Candidate with id {record.SequencialCandidato} with invalid name, skipping...");
return; // Skip candidates with invalid name
}
var candidato = new Candidato
{
Cpf = record.CPFCandidato,
SqCandidato = record.SequencialCandidato,
Nome = record.NomeCandidato,
Email = record.Email.Contains("@") ? record.Email : null,
Sexo = record.Genero,
EstadoCivil = record.EstadoCivil,
Escolaridade = record.GrauInstrucao,
Ocupacao = record.Ocupacao,
Eleicoes = new List<CandidatoMapping>()
{
new CandidatoMapping
{
Cpf = record.CPFCandidato,
Nome = record.NomeCandidato,
SqCandidato = record.SequencialCandidato,
Ano = record.AnoEleicao,
TipoEleicao = record.TipoAbrangencia,
NomeUE = record.NomeUE,
SiglaUF = record.SiglaUF,
Cargo = record.DescricaoCargo,
NrCandidato = record.NumeroCandidato,
Resultado = record.SituacaoTurno,
}
}
};
if (!string.IsNullOrEmpty(record.DataNascimento) &&
record.DataNascimento != "#NULO")
{
if (DateTime.TryParseExact(record.DataNascimento, "dd/MM/yyyy",
CultureInfo.InvariantCulture, DateTimeStyles.AssumeLocal, out var dataNascimento))
{
// Convert to UTC DateTime to work with PostgreSQL timestamp with time zone
candidato.DataNascimento = DateTime.SpecifyKind(dataNascimento, DateTimeKind.Utc);
}
}
else
{
candidato.DataNascimento = null; // Handle null/empty/whitespace date
}
await candidatoService.AddCandidatoAsync(candidato);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseCandidatosAsync - Error processing candidate with id {CandidatoId}", record.SequencialCandidato);
}
});
logger.LogInformation("ParseCandidatosAsync - Finished parsing candidatos from {FilePath}", filePath);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseCandidatosAsync - Error parsing candidatos file {FilePath}", filePath);
throw;
}
}
public async Task ParseBensCandidatosAsync(string filePath)
{
logger.LogInformation($"ParseBensCandidatosAsync - Starting to parse bens candidatos from '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseBensCandidatosAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
csv.Context.RegisterClassMap<BemCandidatoMap>();
var records = csv.GetRecords<BemCandidatoCSV>();
foreach (var record in records)
{
try
{
// Parse decimal value
decimal? valor = null;
if (!string.IsNullOrEmpty(record.ValorBemCandidato))
{
string normalizedValue = record.ValorBemCandidato.Replace(".", "").Replace(",", ".");
if (decimal.TryParse(normalizedValue, NumberStyles.Any, CultureInfo.InvariantCulture, out var parsedValue))
{
valor = parsedValue;
}
}
var bemCandidato = new BemCandidato
{
SqCandidato = record.SequencialCandidato,
Ano = record.AnoEleicao,
SiglaUF = record.SiglaUF,
NomeUE = record.NomeUE,
OrdemBem = record.NumeroOrdemBemCandidato,
TipoBem = record.DescricaoTipoBemCandidato,
Descricao = record.DescricaoBemCandidato,
Valor = valor
};
await bemCandidatoService.AddBemCandidatoAsync(bemCandidato);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseBensCandidatosAsync - Error processing bem candidato with id {CandidatoId} and ordem {OrdemBem}",
record.SequencialCandidato, record.NumeroOrdemBemCandidato);
}
}
logger.LogInformation("ParseBensCandidatosAsync - Finished parsing bens candidatos from {FilePath}", filePath);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseBensCandidatosAsync - Error parsing bens candidatos file {FilePath}", filePath);
throw;
}
}
public async Task ParseRedeSocialAsync(string filePath)
{
logger.LogInformation($"ParseRedeSocialAsync - Starting to parse redes sociais from '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseRedeSocialAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
csv.Context.RegisterClassMap<RedeSocialMap>();
var records = csv.GetRecords<RedeSocialCSV>();
foreach (var record in records)
{
try
{
var redeSocial = new RedeSocial
{
SqCandidato = record.SequencialCandidato,
Ano = record.DataEleicao,
SiglaUF = record.SiglaUF,
Link = record.Url,
Rede = string.Empty
};
await redeSocialService.AddRedeSocialAsync(redeSocial);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseRedeSocialAsync - Error processing redes sociais with id {SequencialCandidato} and link {Url}",
record.SequencialCandidato, record.Url);
}
}
logger.LogInformation("ParseRedeSocialAsync - Finished parsing redes sociais from {FilePath}", filePath);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseRedeSocialAsync - Error parsing redes sociais file {FilePath}", filePath);
throw;
}
}
}
}