stuff and refactor
This commit is contained in:
118
OpenCand.ETL/Parser/CsvServices/CsvFixerService.cs
Normal file
118
OpenCand.ETL/Parser/CsvServices/CsvFixerService.cs
Normal file
@@ -0,0 +1,118 @@
|
||||
using System.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace OpenCand.Parser.Services
|
||||
{
|
||||
/// <summary>
/// Repairs semicolon-delimited CSV files in which a single logical record has been
/// split across several physical lines, and writes the repaired content to a sibling
/// file whose name carries a "fix_" prefix.
/// </summary>
public class CsvFixerService
{
    /// <summary>Field separator used to count columns on each line.</summary>
    private const char Delimiter = ';';

    /// <summary>Code page (Windows-1252) used to decode the source file.</summary>
    private const int SourceCodePage = 1252;

    private readonly ILogger<CsvFixerService> logger;

    static CsvFixerService()
    {
        // Code page 1252 is not one of the encodings built into modern .NET;
        // Encoding.GetEncoding(1252) throws NotSupportedException unless the
        // code-pages provider has been registered. Registering an already
        // registered provider is a no-op, so this is safe even if the host
        // application registers it as well.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }

    /// <summary>Creates the service.</summary>
    /// <param name="logger">Logger that receives progress, warning and error messages.</param>
    public CsvFixerService(
        ILogger<CsvFixerService> logger)
    {
        this.logger = logger;
    }

    /// <summary>
    /// Reads the CSV file at <paramref name="filePath"/>, merges physical lines until every
    /// record has as many columns as the header line, and writes the result (UTF-8) to
    /// "fix_&lt;original file name&gt;" in the same directory.
    /// </summary>
    /// <param name="filePath">Path of the CSV file to repair.</param>
    /// <returns>
    /// The path of the repaired file, or <see cref="string.Empty"/> when the file is missing,
    /// the path has no file name or directory part, the file or its header line is empty,
    /// a record has more columns than the header, the end of the file is reached in the
    /// middle of a record, or any exception occurs while reading or writing.
    /// </returns>
    public string FixCsvFile(string filePath)
    {
        var filename = Path.GetFileName(filePath);
        var path = Path.GetDirectoryName(filePath);

        if (!File.Exists(filePath))
        {
            logger.LogError("FixCsvFile - The file at '{FilePath}' does not exist", filePath);
            return string.Empty;
        }

        if (string.IsNullOrEmpty(filename) || string.IsNullOrEmpty(path))
        {
            logger.LogError("FixCsvFile - The file path '{FilePath}' is invalid", filePath);
            return string.Empty;
        }

        // Fixed file will have the same name but with "fix_" prefix
        var newFilePath = Path.Combine(path, $"fix_{filename}");
        if (File.Exists(newFilePath))
        {
            logger.LogWarning("FixCsvFile - A fixed file already exists at '{NewFilePath}'. It will be overwritten.", newFilePath);
        }

        logger.LogInformation("FixCsvFile - Starting to fix CSV file at '{FilePath}'", filePath);

        try
        {
            var lines = File.ReadAllLines(filePath, Encoding.GetEncoding(SourceCodePage));

            if (lines.Length == 0)
            {
                logger.LogError("FixCsvFile - The file at '{FilePath}' is empty", filePath);
                return string.Empty;
            }

            // string.Split never returns an empty array, so a column count of zero cannot
            // be used to detect a missing header; check the header text itself instead.
            if (string.IsNullOrWhiteSpace(lines[0]))
            {
                logger.LogError("FixCsvFile - The first line of the file at '{FilePath}' does not contain any headers", filePath);
                return string.Empty;
            }

            var headerCount = lines[0].Split(Delimiter).Length;
            logger.LogInformation("FixCsvFile - Detected {HeaderCount} headers in the CSV file", headerCount);

            var newLines = new List<string>(lines.Length);

            // 'i' is advanced at the bottom of the loop by the number of physical
            // lines consumed for the current record.
            for (var i = 0; i < lines.Length;)
            {
                var line = lines[i];
                var columns = line.Split(Delimiter);
                var lineJump = 1;

                while (columns.Length != headerCount)
                {
                    if (columns.Length > headerCount)
                    {
                        // Too many columns cannot be repaired by merging => needs manual intervention
                        logger.LogCritical(
                            "FixCsvFile - Line {LineNumber} has {ColumnCount} columns, expected {HeaderCount}. Halting process.",
                            i + 1, columns.Length, headerCount);
                        return string.Empty;
                    }

                    logger.LogWarning(
                        "FixCsvFile - Line {LineNumber} has {ColumnCount} columns, expected {HeaderCount}. Attempting to fix [i = {LineJump}]...",
                        i + 1, columns.Length, headerCount, lineJump);

                    // Likely the original record contained line breaks that split it across
                    // several physical lines: append the next physical line and re-check.
                    if (i + lineJump >= lines.Length)
                    {
                        logger.LogCritical(
                            "FixCsvFile - Reached the end of the file while trying to fix line {LineNumber}. Cannot continue.",
                            i + 1);
                        return string.Empty;
                    }

                    // The fragments are concatenated directly; no separator is inserted
                    // where the line break used to be.
                    line += lines[i + lineJump];
                    columns = line.Split(Delimiter);
                    lineJump++;
                }

                newLines.Add(line);
                i += lineJump;
            }

            File.WriteAllLines(newFilePath, newLines, Encoding.UTF8);

            logger.LogInformation("FixCsvFile - Successfully fixed CSV file at {NewFilePath}", newFilePath);
            return newFilePath;
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "FixCsvFile - Error fixing CSV file at {FilePath}", filePath);
            return string.Empty;
        }
    }
}
|
||||
}
|
150
OpenCand.ETL/Parser/CsvServices/CsvParserService.cs
Normal file
150
OpenCand.ETL/Parser/CsvServices/CsvParserService.cs
Normal file
@@ -0,0 +1,150 @@
|
||||
using System.Globalization;
|
||||
using CsvHelper;
|
||||
using CsvHelper.Configuration;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using OpenCand.ETL.Contracts;
|
||||
|
||||
namespace OpenCand.Parser.Services
|
||||
{
|
||||
/// <summary>
/// Reads a semicolon-delimited CSV file into <typeparamref name="CsvObj"/> records and hands
/// each record to an <see cref="IParserService{CsvObj}"/>, logging progress periodically.
/// </summary>
/// <typeparam name="CsvObj">Type each CSV row is mapped to.</typeparam>
public class CsvParserService<CsvObj> : IDisposable
{
    /// <summary>Upper bound on the number of records processed concurrently.</summary>
    private const int MaxParallelism = 40;

    /// <summary>Delay before the first progress log and interval between subsequent ones.</summary>
    private static readonly TimeSpan ProgressInterval = TimeSpan.FromSeconds(5);

    private readonly ILogger<CsvParserService<CsvObj>> logger;
    private readonly CsvFixerService csvFixerService;
    private readonly IParserService<CsvObj> parserService;

    private readonly CsvConfiguration parserConfig;

    // Progress tracking fields
    private long processedCount;
    private long totalCount;
    private string currentTask = string.Empty;
    private Timer? progressTimer;
    private readonly object progressLock = new object();

    /// <summary>Creates the service and its CSV reader configuration.</summary>
    /// <param name="logger">Logger that receives progress and error messages.</param>
    /// <param name="parserService">Handler invoked once for every record read from the file.</param>
    /// <param name="csvFixerService">Service used to repair the file before it is read.</param>
    public CsvParserService(
        ILogger<CsvParserService<CsvObj>> logger,
        IParserService<CsvObj> parserService,
        CsvFixerService csvFixerService)
    {
        this.logger = logger;
        this.csvFixerService = csvFixerService;
        this.parserService = parserService;

        parserConfig = new CsvConfiguration(CultureInfo.InvariantCulture)
        {
            Delimiter = ";",
            HasHeaderRecord = true,
            // Invariant lower-casing keeps header matching independent of the machine's culture.
            PrepareHeaderForMatch = args => args.Header.ToLowerInvariant(),
            MissingFieldFound = null,
            TrimOptions = TrimOptions.Trim,
            Encoding = System.Text.Encoding.UTF8
        };
    }

    /// <summary>
    /// Repairs the CSV file at <paramref name="filePath"/>, loads all of its records and
    /// processes them in parallel through the parser service.
    /// </summary>
    /// <param name="filePath">Path of the CSV file to parse.</param>
    /// <exception cref="InvalidOperationException">The file could not be repaired.</exception>
    /// <remarks>
    /// A failure while processing an individual record is logged and does not stop the run;
    /// a failure while opening or reading the file is logged and rethrown.
    /// </remarks>
    public async Task ParseFolderAsync(string filePath)
    {
        logger.LogInformation("ParseFolderAsync - Starting to parse '{FilePath}'", filePath);

        // Fix the CSV file if necessary. The result is kept in its own variable so the
        // original path is still available for the failure message.
        var fixedFilePath = csvFixerService.FixCsvFile(filePath);

        if (string.IsNullOrEmpty(fixedFilePath))
        {
            logger.LogError("ParseFolderAsync - Failed to fix CSV file at '{FilePath}'", filePath);
            throw new InvalidOperationException($"Failed to fix CSV file at '{filePath}'");
        }

        try
        {
            using var reader = new StreamReader(fixedFilePath);
            using var csv = new CsvReader(reader, parserConfig);
            var po = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxParallelism
            };

            var records = csv.GetRecords<CsvObj>().ToList();

            // typeof(...).Name reports the actual record type; nameof(CsvObj) would only
            // yield the literal text "CsvObj".
            StartProgressTracking($"Parsing {typeof(CsvObj).Name} - {Path.GetFileName(fixedFilePath)}", records.Count);

            try
            {
                await Parallel.ForEachAsync(records, po, async (record, ct) =>
                {
                    try
                    {
                        await parserService.ParseObject(record);
                    }
                    catch (Exception ex)
                    {
                        logger.LogError(ex, "ParseFolderAsync - Error processing:");
                    }
                    finally
                    {
                        // A record counts as processed whether it succeeded or failed.
                        IncrementProgress();
                    }
                });
            }
            finally
            {
                // Always stop the timer, even if the parallel loop itself throws.
                StopProgressTracking();
            }

            logger.LogInformation("ParseFolderAsync - Finished parsing from {FilePath}", fixedFilePath);
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "ParseFolderAsync - Error parsing file {FilePath}", fixedFilePath);
            throw;
        }
    }

    // Progress tracking methods

    /// <summary>Resets the counters and starts the periodic progress log for a new task.</summary>
    /// <param name="taskName">Label written with every progress message.</param>
    /// <param name="total">Number of records expected to be processed.</param>
    private void StartProgressTracking(string taskName, long total)
    {
        lock (progressLock)
        {
            currentTask = taskName;
            Interlocked.Exchange(ref processedCount, 0);
            totalCount = total;

            progressTimer?.Dispose();
            progressTimer = new Timer(LogProgress, null, ProgressInterval, ProgressInterval);

            logger.LogInformation("Progress - Task: {Task}, Total: {Total}", currentTask, totalCount);
        }
    }

    /// <summary>Records that one more record has been handled.</summary>
    private void IncrementProgress()
    {
        Interlocked.Increment(ref processedCount);
    }

    /// <summary>Stops the periodic progress log and writes a final progress message.</summary>
    private void StopProgressTracking()
    {
        lock (progressLock)
        {
            progressTimer?.Dispose();
            progressTimer = null;

            // Log final progress
            WriteProgress();
        }
    }

    /// <summary>Timer callback: logs the current progress while a task is being tracked.</summary>
    /// <param name="state">Unused timer state.</param>
    private void LogProgress(object? state)
    {
        lock (progressLock)
        {
            // A callback that was already queued can still run after the timer has been
            // disposed; skip it so nothing is logged after the final progress message.
            if (progressTimer is null || string.IsNullOrEmpty(currentTask))
            {
                return;
            }

            WriteProgress();
        }
    }

    /// <summary>Writes one progress message. Callers must hold <see cref="progressLock"/>.</summary>
    private void WriteProgress()
    {
        // Interlocked.Read pairs with Interlocked.Increment so the 64-bit counter is read atomically.
        var processed = Interlocked.Read(ref processedCount);
        var percentage = totalCount > 0 ? (double)processed / totalCount * 100 : 0;

        logger.LogInformation("Progress - Task: {Task}, Processed: {Processed}, Total: {Total}, Progress: {Percentage:F2}%",
            currentTask, processed, totalCount, percentage);
    }

    /// <summary>Releases the progress timer, if one is active. Safe to call more than once.</summary>
    public void Dispose()
    {
        lock (progressLock)
        {
            progressTimer?.Dispose();
            progressTimer = null;
        }
    }
}
|
||||
}
|
Reference in New Issue
Block a user