Compare commits

...

2 Commits

Author SHA1 Message Date
2660826a3f stuff and refactor
All checks were successful
API and ETL Build / build_etl (push) Successful in 8s
API and ETL Build / build_api (push) Successful in 9s
2025-06-03 16:27:39 -03:00
03b1f4f1d1 add apelido 2025-06-02 16:47:24 -03:00
16 changed files with 561 additions and 426 deletions

View File

@ -15,11 +15,25 @@ namespace OpenCand.Repository
using (var connection = new NpgsqlConnection(ConnectionString))
{
return (await connection.QueryAsync<Candidato>(@"
SELECT idcandidato, cpf, nome, datanascimento, email, sexo, estadocivil, escolaridade, ocupacao
SELECT *,
CASE
WHEN lower(apelido) = lower(@query) THEN 0 -- apelido Exact match (case-insensitive)
WHEN lower(apelido) LIKE lower(@query) || '%' THEN 1 -- apelido Starts with match (case-insensitive)
WHEN lower(apelido) LIKE '%' || lower(@query) THEN 2 -- apelido Contains anywhere match (case-insensitive)
WHEN lower(nome) = lower(@query) THEN 0 -- nome Exact match (case-insensitive)
WHEN lower(nome) LIKE lower(@query) || '%' THEN 1 -- nome Starts with match (case-insensitive)
WHEN lower(nome) LIKE '%' || lower(@query) THEN 2 -- nome Contains anywhere match (case-insensitive)
WHEN cpf = @query THEN 0 -- cpf Exact match for CPF
WHEN cpf LIKE @query || '%' THEN 1 -- cpf Starts with match for CPF
WHEN cpf LIKE '%' || @query THEN 2 -- cpf Contains anywhere match for CPF
ELSE 3
END AS name_rank
FROM candidato
WHERE nome ILIKE '%' || @query || '%' OR
cpf ILIKE '%' || @query || '%' OR
email ILIKE '%' || @query || '%'
WHERE apelido ILIKE '%' || @query || '%' OR
nome ILIKE '%' || @query || '%' OR
cpf ILIKE '%' || @query || '%'
ORDER BY name_rank,
length(nome) ASC
LIMIT 10;",
new { query })).AsList();
}

View File

@ -0,0 +1,33 @@
#!/bin/bash
cd ./fotos_cand
COUNT=0
shopt -s nocaseglob
# Loop through all folders
for dir in */; do
# Change into the directory
cd "$dir" || continue
# Loop over every “.jpeg” (or “.JPEG”):
for f in *.jpeg; do
# “${f%.[jJ][pP][eE][gG]}” strips off the .jpeg/.JPEG suffix
base="${f%.[jJ][pP][eE][gG]}"
newfile="${base}.jpg"
# If theres already a .jpg with the same “base,” decide what to do:
if [ -e "$newfile" ]; then
echo "Skipping $f$newfile (target exists)"
# you could `rm "$f"` or move it to a backup folder here if you prefer
else
mv -v "$f" "$newfile"
fi
done
# Change back to the parent directory
cd ..
done
shopt -u nocaseglob
# Print a message indicating completion
echo "Normalization complete. Processed $COUNT files."

View File

@ -12,6 +12,8 @@ namespace OpenCand.Core.Models
public string Nome { get; set; }
public string Apelido { get; set; }
public DateTime? DataNascimento { get; set; }
public string Email { get; set; }
@ -35,6 +37,7 @@ namespace OpenCand.Core.Models
public Guid IdCandidato { get; set; }
public string Cpf { get; set; }
public string Nome { get; set; }
public string Apelido { get; set; }
public string SqCandidato { get; set; }
public int Ano { get; set; }
public string TipoEleicao { get; set; }

View File

@ -0,0 +1,7 @@
namespace OpenCand.ETL.Contracts
{
public interface IParserService<CsvObj>
{
Task ParseObject(CsvObj record);
}
}

View File

@ -1,15 +1,14 @@
using System.Text;
using Microsoft.Extensions.Logging;
using OpenCand.Repository;
namespace OpenCand.Parser.Services
{
public class CsvFixerService
{
private readonly ILogger<CsvParserService> logger;
private readonly ILogger<CsvFixerService> logger;
public CsvFixerService(
ILogger<CsvParserService> logger)
ILogger<CsvFixerService> logger)
{
this.logger = logger;
}

View File

@ -0,0 +1,150 @@
using System.Globalization;
using CsvHelper;
using CsvHelper.Configuration;
using Microsoft.Extensions.Logging;
using OpenCand.ETL.Contracts;
namespace OpenCand.Parser.Services
{
public class CsvParserService<CsvObj> : IDisposable
{
private readonly ILogger<CsvParserService<CsvObj>> logger;
private readonly CsvFixerService csvFixerService;
private readonly IParserService<CsvObj> parserService;
private readonly CsvConfiguration parserConfig;
// Progress tracking fields
private long processedCount;
private long totalCount;
private string currentTask = string.Empty;
private Timer? progressTimer;
private readonly object progressLock = new object();
public CsvParserService(
ILogger<CsvParserService<CsvObj>> logger,
IParserService<CsvObj> parserService,
CsvFixerService csvFixerService)
{
this.logger = logger;
this.csvFixerService = csvFixerService;
this.parserService = parserService;
parserConfig = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";",
HasHeaderRecord = true,
PrepareHeaderForMatch = args => args.Header.ToLower(),
MissingFieldFound = null,
TrimOptions = TrimOptions.Trim,
Encoding = System.Text.Encoding.UTF8
};
}
public async Task ParseFolderAsync(string filePath)
{
logger.LogInformation($"ParseFolderAsync - Starting to parse '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseFolderAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
var po = new ParallelOptions
{
MaxDegreeOfParallelism = 40
};
//csv.Context.RegisterClassMap<ClassMap<CsvObj>>(); // optional for advanced mapping, not needed
var records = csv.GetRecords<CsvObj>().ToList();
StartProgressTracking($"Parsing {nameof(CsvObj)} - {Path.GetFileName(filePath)}", records.Count);
await Parallel.ForEachAsync(records, po, async (record, ct) =>
{
try
{
await parserService.ParseObject(record);
// Increment progress
IncrementProgress();
}
catch (Exception ex)
{
logger.LogError(ex, $"ParseFolderAsync - Error processing:");
IncrementProgress();
}
});
StopProgressTracking();
logger.LogInformation($"ParseFolderAsync - Finished parsing from {filePath}");
}
catch (Exception ex)
{
logger.LogError(ex, $"ParseFolderAsync - Error parsing file {filePath}");
throw;
}
}
// Progress tracking methods
private void StartProgressTracking(string taskName, long total)
{
lock (progressLock)
{
currentTask = taskName;
processedCount = 0;
totalCount = total;
progressTimer?.Dispose();
progressTimer = new Timer(LogProgress, null, TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(5));
logger.LogInformation("Progress - Task: {Task}, Total: {Total}", currentTask, totalCount);
}
}
private void IncrementProgress()
{
Interlocked.Increment(ref processedCount);
}
private void StopProgressTracking()
{
lock (progressLock)
{
progressTimer?.Dispose();
progressTimer = null;
// Log final progress
var percentage = totalCount > 0 ? (double)processedCount / totalCount * 100 : 0;
logger.LogInformation("Progress - Task: {Task}, Processed: {Processed}, Total: {Total}, Progress: {Percentage:F2}%",
currentTask, processedCount, totalCount, percentage);
}
}
private void LogProgress(object? state)
{
lock (progressLock)
{
if (string.IsNullOrEmpty(currentTask)) return;
var percentage = totalCount > 0 ? (double)processedCount / totalCount * 100 : 0;
logger.LogInformation("Progress - Task: {Task}, Processed: {Processed}, Total: {Total}, Progress: {Percentage:F2}%",
currentTask, processedCount, totalCount, percentage);
}
}
public void Dispose()
{
progressTimer?.Dispose();
}
}
}

View File

@ -29,6 +29,9 @@ namespace OpenCand.Parser.Models
[Name("NM_CANDIDATO")]
public string NomeCandidato { get; set; }
[Name("NM_URNA_CANDIDATO")]
public string Apelido { get; set; }
[Name("NR_CPF_CANDIDATO")]
public string CPFCandidato { get; set; }

View File

@ -2,116 +2,81 @@ using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using OpenCand.Config;
using OpenCand.Parser.Models;
using OpenCand.Parser.Services;
namespace OpenCand.Parser
{
public class ParserManager
{
private readonly CsvParserService csvParserService;
private readonly ILogger<ParserManager> logger;
private readonly CsvSettings csvSettings;
private readonly IConfiguration configuration;
private readonly CsvParserService<CandidatoCSV> candidatoParserService;
private readonly CsvParserService<BemCandidatoCSV> bemCandidatoParserService;
private readonly CsvParserService<RedeSocialCSV> redeSocialParserService;
private readonly string BasePath;
public ParserManager(
CsvParserService csvParserService,
IOptions<CsvSettings> csvSettings,
ILogger<ParserManager> logger,
IConfiguration configuration)
IConfiguration configuration,
CsvParserService<CandidatoCSV> candidatoParserService,
CsvParserService<BemCandidatoCSV> bemCandidatoParserService,
CsvParserService<RedeSocialCSV> redeSocialParserService)
{
this.csvParserService = csvParserService;
this.logger = logger;
this.csvSettings = csvSettings.Value;
this.configuration = configuration;
this.candidatoParserService = candidatoParserService;
this.bemCandidatoParserService = bemCandidatoParserService;
this.redeSocialParserService = redeSocialParserService;
// Get the base path from either SampleFolder in csvSettings or the BasePath in configuration
BasePath = configuration.GetValue<string>("BasePath") ?? string.Empty;
if (string.IsNullOrEmpty(BasePath))
{
throw new Exception("ParseFullDataAsync - BasePath is not configured in appsettings.json or CsvSettings.SampleFolder");
}
}
public async Task ParseFullDataAsync()
{
logger.LogInformation("ParseFullDataAsync - Starting parsing");
logger.LogInformation("ParseFullDataAsync - Processing will happen with BasePath: {BasePath}", BasePath);
// Get the base path from either SampleFolder in csvSettings or the BasePath in configuration
var basePath = configuration.GetValue<string>("BasePath");
var candidatosDirectory = Path.Combine(BasePath, csvSettings.CandidatosFolder);
var bensCandidatosDirectory = Path.Combine(BasePath, csvSettings.BensCandidatosFolder);
var redesSociaisDirectory = Path.Combine(BasePath, csvSettings.RedesSociaisFolder);
if (string.IsNullOrEmpty(basePath))
//await ParseFolder(candidatosDirectory, candidatoParserService);
await ParseFolder(bensCandidatosDirectory, bemCandidatoParserService);
await ParseFolder(redesSociaisDirectory, redeSocialParserService);
logger.LogInformation("ParseFullDataAsync - Full data parsing completed!");
}
private async Task ParseFolder<CsvObj>(string csvDirectory, CsvParserService<CsvObj> csvParserService)
{
if (Directory.Exists(csvDirectory))
{
logger.LogError("ParseFullDataAsync - BasePath is not configured in appsettings.json or CsvSettings.SampleFolder");
return;
foreach (var filePath in Directory.GetFiles(csvDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFolder - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFolder - Parsing data from {FilePath}", filePath);
await csvParserService.ParseFolderAsync(filePath);
}
}
logger.LogInformation("ParseFullDataAsync - Processing will happen with BasePath: {BasePath}", basePath);
try
else
{
var candidatosDirectory = Path.Combine(basePath, csvSettings.CandidatosFolder);
var bensCandidatosDirectory = Path.Combine(basePath, csvSettings.BensCandidatosFolder);
var redesSociaisDirectory = Path.Combine(basePath, csvSettings.RedesSociaisFolder);
if (Directory.Exists(candidatosDirectory))
{
foreach (var filePath in Directory.GetFiles(candidatosDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFullDataAsync - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFullDataAsync - Parsing candidatos data from {FilePath}", filePath);
await csvParserService.ParseCandidatosAsync(filePath);
}
}
else
{
logger.LogWarning("ParseFullDataAsync - 'Candidatos' directory not found at {Directory}", candidatosDirectory);
}
if (Directory.Exists(bensCandidatosDirectory))
{
foreach (var filePath in Directory.GetFiles(bensCandidatosDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFullDataAsync - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFullDataAsync - Parsing bens candidatos data from {FilePath}", filePath);
await csvParserService.ParseBensCandidatosAsync(filePath);
}
}
else
{
logger.LogWarning("ParseFullDataAsync - 'Bens candidatos' directory not found at {Directory}", bensCandidatosDirectory);
}
if (Directory.Exists(redesSociaisDirectory))
{
foreach (var filePath in Directory.GetFiles(redesSociaisDirectory, "*.csv"))
{
// Check if filePath contains "fix_" prefix
if (filePath.Contains("fix_"))
{
logger.LogInformation("ParseFullDataAsync - Skipping already fixed file: {FilePath}", filePath);
continue;
}
logger.LogInformation("ParseFullDataAsync - Parsing redes sociais data from {FilePath}", filePath);
await csvParserService.ParseRedeSocialAsync(filePath);
}
}
else
{
logger.LogWarning("ParseFullDataAsync - 'Redes sociais' directory not found at {Directory}", redesSociaisDirectory);
}
logger.LogInformation("ParseFullDataAsync - Full data parsing completed!");
}
catch (Exception ex)
{
logger.LogError(ex, "ParseFullDataAsync - Error parsing full data set");
throw;
logger.LogWarning("ParseFolder - Directory not found at {Directory}", csvDirectory);
}
}
}

View File

@ -0,0 +1,52 @@
using Microsoft.Extensions.Logging;
using OpenCand.Core.Models;
using System.Globalization;
using OpenCand.ETL.Contracts;
using OpenCand.Parser.Models;
using OpenCand.Services;
using OpenCand.Parser.Services;
namespace OpenCand.ETL.Parser.ParserServices
{
public class BemCandidatoParserService : IParserService<BemCandidatoCSV>
{
private readonly ILogger<BemCandidatoParserService> logger;
private readonly BemCandidatoService bemCandidatoService;
public BemCandidatoParserService(
ILogger<BemCandidatoParserService> logger,
BemCandidatoService bemCandidatoService)
{
this.logger = logger;
this.bemCandidatoService = bemCandidatoService;
}
public async Task ParseObject(BemCandidatoCSV record)
{
// Parse decimal value
decimal? valor = null;
if (!string.IsNullOrEmpty(record.ValorBemCandidato))
{
string normalizedValue = record.ValorBemCandidato.Replace(".", "").Replace(",", ".");
if (decimal.TryParse(normalizedValue, NumberStyles.Any, CultureInfo.InvariantCulture, out var parsedValue))
{
valor = parsedValue;
}
}
var bemCandidato = new BemCandidato
{
SqCandidato = record.SequencialCandidato,
Ano = record.AnoEleicao,
SiglaUF = record.SiglaUF,
NomeUE = record.NomeUE,
OrdemBem = record.NumeroOrdemBemCandidato,
TipoBem = record.DescricaoTipoBemCandidato,
Descricao = record.DescricaoBemCandidato,
Valor = valor
};
await bemCandidatoService.AddBemCandidatoAsync(bemCandidato);
}
}
}

View File

@ -0,0 +1,100 @@
using System.Globalization;
using Microsoft.Extensions.Logging;
using OpenCand.Core.Models;
using OpenCand.ETL.Contracts;
using OpenCand.Parser.Models;
using OpenCand.Services;
namespace OpenCand.ETL.Parser.ParserServices
{
public class CandidatoParserService : IParserService<CandidatoCSV>
{
private readonly ILogger<CandidatoParserService> logger;
private readonly CandidatoService candidatoService;
public CandidatoParserService(
ILogger<CandidatoParserService> logger,
CandidatoService candidatoService)
{
this.logger = logger;
this.candidatoService = candidatoService;
}
public async Task ParseObject(CandidatoCSV record)
{
if (string.IsNullOrWhiteSpace(record.CPFCandidato) || record.CPFCandidato.Length <= 3)
{
record.CPFCandidato = null; // Handle null/empty/whitespace CPF
}
if (record.NomeCandidato == "NÃO DIVULGÁVEL" ||
string.IsNullOrEmpty(record.NomeCandidato) ||
record.NomeCandidato == "#NULO")
{
logger.LogCritical($"ParseCandidatosAsync - Candidate with id {record.SequencialCandidato} with invalid name, skipping...");
return; // Skip candidates with invalid name
}
if (string.IsNullOrWhiteSpace(record.Apelido) ||
record.Apelido.Contains("#NUL") ||
record.Apelido.Contains("NULO#") ||
record.Apelido.Contains("#NE"))
{
record.Apelido = null;
}
var candidato = new Candidato
{
Cpf = record.CPFCandidato,
SqCandidato = record.SequencialCandidato,
Nome = record.NomeCandidato,
Apelido = record.Apelido,
Email = record.Email.Contains("@") ? record.Email : null,
Sexo = record.Genero,
EstadoCivil = record.EstadoCivil,
Escolaridade = record.GrauInstrucao,
Ocupacao = record.Ocupacao,
Eleicoes = new List<CandidatoMapping>()
{
new CandidatoMapping
{
Cpf = record.CPFCandidato,
Nome = record.NomeCandidato,
Apelido = record.Apelido,
SqCandidato = record.SequencialCandidato,
Ano = record.AnoEleicao,
TipoEleicao = record.TipoAbrangencia,
NomeUE = record.NomeUE,
SiglaUF = record.SiglaUF,
Cargo = record.DescricaoCargo,
NrCandidato = record.NumeroCandidato,
Resultado = record.SituacaoTurno,
Partido = new Partido
{
Sigla = record.SiglaPartido,
Nome = record.NomePartido,
Numero = record.NumeroPartido,
}
}
}
};
if (!string.IsNullOrEmpty(record.DataNascimento) &&
record.DataNascimento != "#NULO")
{
if (DateTime.TryParseExact(record.DataNascimento, "dd/MM/yyyy",
CultureInfo.InvariantCulture, DateTimeStyles.AssumeLocal, out var dataNascimento))
{
// Convert to UTC DateTime to work with PostgreSQL timestamp with time zone
candidato.DataNascimento = DateTime.SpecifyKind(dataNascimento, DateTimeKind.Utc);
}
}
else
{
candidato.DataNascimento = null; // Handle null/empty/whitespace date
}
await candidatoService.AddCandidatoAsync(candidato);
}
}
}

View File

@ -0,0 +1,37 @@
using Microsoft.Extensions.Logging;
using OpenCand.Core.Models;
using OpenCand.ETL.Contracts;
using OpenCand.Parser.Models;
using OpenCand.Parser.Services;
using OpenCand.Services;
namespace OpenCand.ETL.Parser.ParserServices
{
public class RedeSocialParserService : IParserService<RedeSocialCSV>
{
private readonly ILogger<RedeSocialParserService> logger;
private readonly RedeSocialService redeSocialService;
public RedeSocialParserService(
ILogger<RedeSocialParserService> logger,
RedeSocialService redeSocialService)
{
this.logger = logger;
this.redeSocialService = redeSocialService;
}
public async Task ParseObject(RedeSocialCSV record)
{
var redeSocial = new RedeSocial
{
SqCandidato = record.SequencialCandidato,
Ano = record.DataEleicao,
SiglaUF = record.SiglaUF,
Link = record.Url,
Rede = string.Empty
};
await redeSocialService.AddRedeSocialAsync(redeSocial);
}
}
}

View File

@ -1,274 +0,0 @@
using System.Globalization;
using CsvHelper;
using CsvHelper.Configuration;
using Microsoft.Extensions.Logging;
using OpenCand.Core.Models;
using OpenCand.ETL.Parser.CsvMappers;
using OpenCand.Parser.CsvMappers;
using OpenCand.Parser.Models;
using OpenCand.Services;
namespace OpenCand.Parser.Services
{
public class CsvParserService
{
private readonly ILogger<CsvParserService> logger;
private readonly CandidatoService candidatoService;
private readonly BemCandidatoService bemCandidatoService;
private readonly RedeSocialService redeSocialService;
private readonly CsvFixerService csvFixerService;
private readonly CsvConfiguration parserConfig;
public CsvParserService(
ILogger<CsvParserService> logger,
CandidatoService candidatoService,
BemCandidatoService bemCandidatoService,
RedeSocialService redeSocialService,
CsvFixerService csvFixerService)
{
this.logger = logger;
this.candidatoService = candidatoService;
this.bemCandidatoService = bemCandidatoService;
this.redeSocialService = redeSocialService;
this.csvFixerService = csvFixerService;
parserConfig = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";",
HasHeaderRecord = true,
PrepareHeaderForMatch = args => args.Header.ToLower(),
MissingFieldFound = null,
TrimOptions = TrimOptions.Trim,
Encoding = System.Text.Encoding.UTF8
};
}
public async Task ParseCandidatosAsync(string filePath)
{
logger.LogInformation($"ParseCandidatosAsync - Starting to parse 'candidatos' from '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseCandidatosAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
var po = new ParallelOptions
{
MaxDegreeOfParallelism = 25
};
csv.Context.RegisterClassMap<CandidatoMap>();
var records = csv.GetRecords<CandidatoCSV>();
await Parallel.ForEachAsync(records, po, async (record, ct) =>
{
try
{
if (string.IsNullOrWhiteSpace(record.CPFCandidato) || record.CPFCandidato.Length <= 3)
{
record.CPFCandidato = null; // Handle null/empty/whitespace CPF
}
if (record.NomeCandidato == "NÃO DIVULGÁVEL" ||
string.IsNullOrEmpty(record.NomeCandidato) ||
record.NomeCandidato == "#NULO")
{
logger.LogCritical($"ParseCandidatosAsync - Candidate with id {record.SequencialCandidato} with invalid name, skipping...");
return; // Skip candidates with invalid name
}
var candidato = new Candidato
{
Cpf = record.CPFCandidato,
SqCandidato = record.SequencialCandidato,
Nome = record.NomeCandidato,
Email = record.Email.Contains("@") ? record.Email : null,
Sexo = record.Genero,
EstadoCivil = record.EstadoCivil,
Escolaridade = record.GrauInstrucao,
Ocupacao = record.Ocupacao,
Eleicoes = new List<CandidatoMapping>()
{
new CandidatoMapping
{
Cpf = record.CPFCandidato,
Nome = record.NomeCandidato,
SqCandidato = record.SequencialCandidato,
Ano = record.AnoEleicao,
TipoEleicao = record.TipoAbrangencia,
NomeUE = record.NomeUE,
SiglaUF = record.SiglaUF,
Cargo = record.DescricaoCargo,
NrCandidato = record.NumeroCandidato,
Resultado = record.SituacaoTurno,
Partido = new Partido
{
Sigla = record.SiglaPartido,
Nome = record.NomePartido,
Numero = record.NumeroPartido,
}
}
}
};
if (!string.IsNullOrEmpty(record.DataNascimento) &&
record.DataNascimento != "#NULO")
{
if (DateTime.TryParseExact(record.DataNascimento, "dd/MM/yyyy",
CultureInfo.InvariantCulture, DateTimeStyles.AssumeLocal, out var dataNascimento))
{
// Convert to UTC DateTime to work with PostgreSQL timestamp with time zone
candidato.DataNascimento = DateTime.SpecifyKind(dataNascimento, DateTimeKind.Utc);
}
}
else
{
candidato.DataNascimento = null; // Handle null/empty/whitespace date
}
await candidatoService.AddCandidatoAsync(candidato);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseCandidatosAsync - Error processing candidate with id {CandidatoId}", record.SequencialCandidato);
}
});
logger.LogInformation("ParseCandidatosAsync - Finished parsing candidatos from {FilePath}", filePath);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseCandidatosAsync - Error parsing candidatos file {FilePath}", filePath);
throw;
}
}
public async Task ParseBensCandidatosAsync(string filePath)
{
logger.LogInformation($"ParseBensCandidatosAsync - Starting to parse bens candidatos from '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseBensCandidatosAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
csv.Context.RegisterClassMap<BemCandidatoMap>();
var records = csv.GetRecords<BemCandidatoCSV>();
foreach (var record in records)
{
try
{
// Parse decimal value
decimal? valor = null;
if (!string.IsNullOrEmpty(record.ValorBemCandidato))
{
string normalizedValue = record.ValorBemCandidato.Replace(".", "").Replace(",", ".");
if (decimal.TryParse(normalizedValue, NumberStyles.Any, CultureInfo.InvariantCulture, out var parsedValue))
{
valor = parsedValue;
}
}
var bemCandidato = new BemCandidato
{
SqCandidato = record.SequencialCandidato,
Ano = record.AnoEleicao,
SiglaUF = record.SiglaUF,
NomeUE = record.NomeUE,
OrdemBem = record.NumeroOrdemBemCandidato,
TipoBem = record.DescricaoTipoBemCandidato,
Descricao = record.DescricaoBemCandidato,
Valor = valor
};
await bemCandidatoService.AddBemCandidatoAsync(bemCandidato);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseBensCandidatosAsync - Error processing bem candidato with id {CandidatoId} and ordem {OrdemBem}",
record.SequencialCandidato, record.NumeroOrdemBemCandidato);
}
}
logger.LogInformation("ParseBensCandidatosAsync - Finished parsing bens candidatos from {FilePath}", filePath);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseBensCandidatosAsync - Error parsing bens candidatos file {FilePath}", filePath);
throw;
}
}
public async Task ParseRedeSocialAsync(string filePath)
{
logger.LogInformation($"ParseRedeSocialAsync - Starting to parse redes sociais from '{filePath}'");
filePath = csvFixerService.FixCsvFile(filePath);
// Fix the CSV file if necessary
if (string.IsNullOrEmpty(filePath))
{
logger.LogError($"ParseRedeSocialAsync - Failed to fix CSV file at '{filePath}'");
throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
}
try
{
using var reader = new StreamReader(filePath);
using var csv = new CsvReader(reader, parserConfig);
csv.Context.RegisterClassMap<RedeSocialMap>();
var records = csv.GetRecords<RedeSocialCSV>();
foreach (var record in records)
{
try
{
var redeSocial = new RedeSocial
{
SqCandidato = record.SequencialCandidato,
Ano = record.DataEleicao,
SiglaUF = record.SiglaUF,
Link = record.Url,
Rede = string.Empty
};
await redeSocialService.AddRedeSocialAsync(redeSocial);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseRedeSocialAsync - Error processing redes sociais with id {SequencialCandidato} and link {Url}",
record.SequencialCandidato, record.Url);
}
}
logger.LogInformation("ParseRedeSocialAsync - Finished parsing redes sociais from {FilePath}", filePath);
}
catch (Exception ex)
{
logger.LogError(ex, "ParseRedeSocialAsync - Error parsing redes sociais file {FilePath}", filePath);
throw;
}
}
}
}

View File

@ -3,8 +3,11 @@ using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using OpenCand.Config;
using OpenCand.ETL.Contracts;
using OpenCand.ETL.Parser.ParserServices;
using OpenCand.ETL.Repository;
using OpenCand.Parser;
using OpenCand.Parser.Models;
using OpenCand.Parser.Services;
using OpenCand.Repository;
using OpenCand.Services;
@ -24,9 +27,6 @@ namespace OpenCand
try
{
logger.LogInformation("Initializing database");
// make a test connection to the database
logger.LogInformation("Starting data parsing");
var parserManager = services.GetRequiredService<ParserManager>();
await parserManager.ParseFullDataAsync();
@ -55,9 +55,14 @@ namespace OpenCand
{
// Configuration
services.Configure<CsvSettings>(hostContext.Configuration.GetSection("CsvSettings"));
// Services
services.AddTransient<CsvParserService>();
services.AddTransient<IParserService<CandidatoCSV>, CandidatoParserService>();
services.AddTransient<IParserService<BemCandidatoCSV>, BemCandidatoParserService>();
services.AddTransient<IParserService<RedeSocialCSV>, RedeSocialParserService>();
services.AddTransient<CsvParserService<CandidatoCSV>>();
services.AddTransient<CsvParserService<BemCandidatoCSV>>();
services.AddTransient<CsvParserService<RedeSocialCSV>>();
services.AddTransient<ParserManager>();
services.AddTransient<CandidatoService>();
services.AddTransient<BemCandidatoService>();

View File

@ -16,8 +16,8 @@ namespace OpenCand.Repository
using (var connection = new NpgsqlConnection(ConnectionString))
{
await connection.ExecuteAsync(@"
INSERT INTO candidato (idcandidato, cpf, nome, datanascimento, email, sexo, estadocivil, escolaridade, ocupacao)
VALUES (@idcandidato, @cpf, @nome, @datanascimento, @email, @sexo, @estadocivil, @escolaridade, @ocupacao)
INSERT INTO candidato (idcandidato, cpf, nome, apelido, datanascimento, email, sexo, estadocivil, escolaridade, ocupacao)
VALUES (@idcandidato, @cpf, @nome, @apelido, @datanascimento, @email, @sexo, @estadocivil, @escolaridade, @ocupacao)
ON CONFLICT (idcandidato) DO UPDATE SET
cpf = EXCLUDED.cpf,
nome = EXCLUDED.nome,
@ -26,12 +26,14 @@ namespace OpenCand.Repository
sexo = EXCLUDED.sexo,
estadocivil = EXCLUDED.estadocivil,
escolaridade = EXCLUDED.escolaridade,
ocupacao = EXCLUDED.ocupacao;",
ocupacao = EXCLUDED.ocupacao,
apelido = EXCLUDED.apelido;",
new
{
idcandidato = candidato.IdCandidato,
cpf = candidato.Cpf,
nome = candidato.Nome,
apelido = candidato.Apelido,
datanascimento = candidato.DataNascimento,
email = candidato.Email,
sexo = candidato.Sexo,
@ -47,13 +49,15 @@ namespace OpenCand.Repository
using (var connection = new NpgsqlConnection(ConnectionString))
{
await connection.ExecuteAsync(@"
INSERT INTO candidato_mapping (idcandidato, cpf, nome, sqcandidato, ano, tipoeleicao, siglauf, nomeue, cargo, nrcandidato, sgpartido, resultado)
VALUES (@idcandidato, @cpf, @nome, @sqcandidato, @ano, @tipoeleicao, @siglauf, @nomeue, @cargo, @nrcandidato, @sgpartido, @resultado);",
INSERT INTO candidato_mapping (idcandidato, cpf, nome, apelido, sqcandidato, ano, tipoeleicao, siglauf, nomeue, cargo, nrcandidato, sgpartido, resultado)
VALUES (@idcandidato, @cpf, @nome, @apelido, @sqcandidato, @ano, @tipoeleicao, @siglauf, @nomeue, @cargo, @nrcandidato, @sgpartido, @resultado)
ON CONFLICT DO NOTHING;",
new
{
idcandidato = candidatoMapping.IdCandidato,
cpf = candidatoMapping.Cpf,
nome = candidatoMapping.Nome,
apelido = candidatoMapping.Apelido,
sqcandidato = candidatoMapping.SqCandidato,
ano = candidatoMapping.Ano,
tipoeleicao = candidatoMapping.TipoEleicao,
@ -67,27 +71,45 @@ namespace OpenCand.Repository
}
}
public async Task<List<CandidatoMapping>?> GetCandidatoMappingByCpf(string cpf)
public async Task<Candidato?> GetCandidatoByCpf(string cpf)
{
using (var connection = new NpgsqlConnection(ConnectionString))
{
var query = @"
SELECT idcandidato, cpf, nome, sqcandidato, ano, tipoeleicao, siglauf, nomeue, cargo, nrcandidato, resultado
FROM candidato_mapping
SELECT *
FROM candidato
WHERE cpf = @cpf";
return (await connection.QueryAsync<CandidatoMapping>(query, new { cpf })).AsList();
return await connection.QueryFirstOrDefaultAsync<Candidato>(query, new { cpf });
}
}
public async Task<List<CandidatoMapping>?> GetCandidatoMappingByNome(string nome, string sqcandidato, int ano, string siglauf, string nomeue, string nrcandidato)
public async Task<Candidato?> GetCandidatoByNome(string nome, DateTime datanascimento)
{
using (var connection = new NpgsqlConnection(ConnectionString))
{
var query = @"
SELECT idcandidato, cpf, nome, sqcandidato, ano, tipoeleicao, siglauf, nomeue, cargo, nrcandidato, resultado
SELECT *
FROM candidato
WHERE nome = @nome AND datanascimento = @datanascimento";
return await connection.QueryFirstOrDefaultAsync<Candidato>(query, new { nome, datanascimento });
}
}
public async Task<CandidatoMapping?> GetCandidatoMappingByDetails(string nome, int ano, string cargo, string siglauf, string nomeue, string nrcandidato, string resultado)
{
using (var connection = new NpgsqlConnection(ConnectionString))
{
var query = @"
SELECT *
FROM candidato_mapping
WHERE nome = @nome AND sqcandidato = @sqcandidato AND ano = @ano AND siglauf = @siglauf AND nomeue = @nomeue AND nrcandidato = @nrcandidato";
return (await connection.QueryAsync<CandidatoMapping>(query, new { nome, sqcandidato, ano, siglauf, nomeue, nrcandidato })).AsList();
WHERE nome = @nome AND
ano = @ano AND
cargo = @cargo AND
siglauf = @siglauf AND
nomeue = @nomeue AND
nrcandidato = @nrcandidato AND
resultado = @resultado";
return await connection.QueryFirstOrDefaultAsync<CandidatoMapping>(query, new { nome, ano, cargo, siglauf, nomeue, nrcandidato, resultado });
}
}

View File

@ -35,57 +35,59 @@ namespace OpenCand.Services
await partidoRepository.AddPartidoAsync(candidatoMapping.Partido);
}
List<CandidatoMapping>? mappings = null;
CandidatoMapping? existingMapping = null;
// Check if the candidate already exists in the database
Candidato? existingCandidato;
if (candidato.Cpf == null || candidato.Cpf.Length != 11)
{
// If CPF is not provided or invalid, we STRICTLY search by name and other properties
mappings = await candidatoRepository.GetCandidatoMappingByNome(candidato.Nome,
candidato.SqCandidato,
candidatoMapping.Ano,
candidatoMapping.SiglaUF,
candidatoMapping.NomeUE,
candidatoMapping.NrCandidato);
existingCandidato = await candidatoRepository.GetCandidatoByNome(candidato.Nome, candidato.DataNascimento.GetValueOrDefault());
}
else
{
mappings = await candidatoRepository.GetCandidatoMappingByCpf(candidato.Cpf);
existingCandidato = await candidatoRepository.GetCandidatoByCpf(candidato.Cpf);
}
// If the candidate already exists, we can update the mappings
if (existingCandidato != null)
{
candidato.IdCandidato = existingCandidato.IdCandidato;
candidato.Cpf = GetNonEmptyString(existingCandidato.Cpf, candidato.Cpf);
candidato.Email = GetNonEmptyString(existingCandidato.Email, candidato.Email);
candidato.EstadoCivil = GetNonEmptyString(existingCandidato.EstadoCivil, candidato.EstadoCivil);
candidato.Apelido = GetNonEmptyString(existingCandidato.Apelido, candidato.Apelido);
candidato.Escolaridade = GetNonEmptyString(existingCandidato.Escolaridade, candidato.Escolaridade);
candidato.Ocupacao = GetNonEmptyString(existingCandidato.Ocupacao, candidato.Ocupacao);
candidato.Sexo = GetNonEmptyString(existingCandidato.Sexo, candidato.Sexo);
candidatoMapping.IdCandidato = candidato.IdCandidato;
candidatoMapping.Cpf = GetNonEmptyString(candidato.Cpf, candidatoMapping.Cpf);
// Update the entries for the existing candidate
await candidatoRepository.AddCandidatoAsync(candidato);
await candidatoRepository.AddCandidatoMappingAsync(candidatoMapping);
return;
}
// If the candidate does not exist, we create a new one
CandidatoMapping? existingMapping = await candidatoRepository.GetCandidatoMappingByDetails(candidato.Nome,
candidatoMapping.Ano,
candidatoMapping.Cargo,
candidatoMapping.SiglaUF,
candidatoMapping.NomeUE,
candidatoMapping.NrCandidato,
candidatoMapping.Resultado);
// Check if exists
if (mappings != null && mappings.Count > 0)
if (existingMapping != null)
{
existingMapping = mappings.FirstOrDefault(m => m.Ano == candidatoMapping.Ano &&
m.Cargo == candidatoMapping.Cargo &&
m.SiglaUF == candidatoMapping.SiglaUF &&
m.NomeUE == candidatoMapping.NomeUE &&
m.NrCandidato == candidatoMapping.NrCandidato &&
m.Resultado == candidatoMapping.Resultado);
candidato.IdCandidato = existingMapping.IdCandidato;
candidato.Cpf = GetNonEmptyString(existingMapping.Cpf, candidato.Cpf);
candidato.Apelido = GetNonEmptyString(existingMapping.Apelido, candidato.Apelido);
await candidatoRepository.AddCandidatoAsync(candidato);
// Already exists one for the current election
if (existingMapping != null)
{
candidato.IdCandidato = existingMapping.IdCandidato;
candidato.Cpf = existingMapping.Cpf;
return;
}
await candidatoRepository.AddCandidatoAsync(candidato);
return;
}
// If exists (but not for the current election), we take the existing idcandidato
// and create a new mapping for the current election
else
{
existingMapping = mappings.First();
candidato.IdCandidato = existingMapping.IdCandidato;
candidato.Cpf = existingMapping.Cpf;
}
}
else
{
// No current mapping, we create a new one
// and create a new mapping for the current election
candidato.IdCandidato = Guid.NewGuid();
}
// No current mapping, we create a new one and create a new mapping for the current election
candidato.IdCandidato = Guid.NewGuid();
// Set the mapping properties
candidatoMapping.IdCandidato = candidato.IdCandidato;
@ -96,5 +98,18 @@ namespace OpenCand.Services
await candidatoRepository.AddCandidatoMappingAsync(candidatoMapping);
}
public string GetNonEmptyString(string? value1, string? value2)
{
if (!string.IsNullOrWhiteSpace(value1))
{
return value1;
}
else if (!string.IsNullOrWhiteSpace(value2))
{
return value2;
}
return string.Empty;
}
}
}

View File

@ -8,6 +8,7 @@ CREATE TABLE candidato (
idcandidato UUID NOT NULL PRIMARY KEY,
cpf VARCHAR(11),
nome VARCHAR(255) NOT NULL,
apelido VARCHAR(255),
datanascimento TIMESTAMPTZ,
email TEXT,
sexo CHAR(15),
@ -16,19 +17,21 @@ CREATE TABLE candidato (
ocupacao VARCHAR(150)
);
CREATE INDEX idx_candidato_nome ON candidato (nome);
CREATE INDEX idx_candidato_apelido ON candidato (apelido);
-- Each candidato (idcandidato, cpf, nome) will be mapped to a (sqcandidato, ano, tipo_eleicao, sg_uf, cargo, resultado)
CREATE TABLE candidato_mapping (
idcandidato UUID NOT NULL,
cpf VARCHAR(11),
nome VARCHAR(255) NOT NULL,
apelido VARCHAR(255),
sqcandidato TEXT,
ano INT NOT NULL,
tipoeleicao VARCHAR(50),
siglauf VARCHAR(2),
nomeue VARCHAR(100),
cargo VARCHAR(50),
sgpartido VARCHAR(10),
sgpartido VARCHAR(50),
nrcandidato VARCHAR(20),
resultado VARCHAR(50),
CONSTRAINT pk_candidato_mapping PRIMARY KEY (idcandidato, ano, siglauf, nomeue, cargo, nrcandidato, resultado),
@ -36,6 +39,7 @@ CREATE TABLE candidato_mapping (
);
CREATE INDEX idx_candidato_mapping_cpf ON candidato_mapping (cpf);
CREATE INDEX idx_candidato_mapping_nome ON candidato_mapping (nome);
CREATE INDEX idx_candidato_mapping_apelido ON candidato_mapping (apelido);
CREATE INDEX idx_candidato_mapping_ano ON candidato_mapping (ano);
CREATE INDEX idx_candidato_mapping_sqcandidato ON candidato_mapping (sqcandidato);
@ -67,7 +71,7 @@ CREATE INDEX idx_rede_social_idcandidato ON rede_social (idcandidato);
---- Table for storing party information
CREATE TABLE partido (
sigla VARCHAR(10) NOT NULL PRIMARY KEY,
sigla VARCHAR(50) NOT NULL PRIMARY KEY,
nome VARCHAR(255) NOT NULL,
numero INT NOT NULL
);