// CSF.cs
using System.Text;
using System.Text.RegularExpressions;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
namespace CSFExtractor
{
public partial class CSF
{
// Text extracted from the PDF; assigned by Load and returned by GetText().
private string Text = string.Empty;

// The two values Load stores in Type.
private const string Moral = "Moral";
private const string Physical = "Física";

/// <summary>File name of the loaded PDF, without its directory part.</summary>
public string FileName { get; set; } = string.Empty;

// ---- Identification values captured by Load ----

/// <summary>Value captured after the "RFC:" label.</summary>
public string RFC { get; set; } = string.Empty;
/// <summary>Text captured between "Contribuyentes" and "Nombre, denominación".</summary>
public string FullName { get; set; } = string.Empty;
/// <summary>Name captured with the Physical or the Moral name pattern, depending on <see cref="Type"/>.</summary>
public string Name { get; set; } = string.Empty;
/// <summary>First surname; only filled when <see cref="Type"/> is the Physical value.</summary>
public string Surname { get; set; } = string.Empty;
/// <summary>Second surname; only filled when <see cref="Type"/> is the Physical value.</summary>
public string LastSurname { get; set; } = string.Empty;
/// <summary>Value captured after the "idCIF:" label.</summary>
public string CIF { get; set; } = string.Empty;
/// <summary>Text captured after "Fecha de Emisión", with the RFC removed from it.</summary>
public string Date { get; set; } = string.Empty;
/// <summary>Value captured after the "CURP:" label.</summary>
public string CURP { get; set; } = string.Empty;
/// <summary>Text captured between "operaciones:" and "Estatus".</summary>
public string Started { get; set; } = string.Empty;
/// <summary>Word captured after "padrón:".</summary>
public string Status { get; set; } = string.Empty;
/// <summary>Date text captured after "estado:".</summary>
public string LastChange { get; set; } = string.Empty;
/// <summary>Either the Moral or the Physical constant value, chosen by Load.</summary>
public string Type { get; set; } = string.Empty;

// ---- Address values captured by Load ----

/// <summary>First five-digit token found by the postal pattern.</summary>
public string Postal { get; set; } = string.Empty;
/// <summary>Text captured between "Vialidad:" and "Nombre de V".</summary>
public string StreetType { get; set; } = string.Empty;
/// <summary>Text captured after "Nombre de Vialidad:".</summary>
public string Street { get; set; } = string.Empty;
/// <summary>Text captured after "Exterior:".</summary>
public string Number { get; set; } = string.Empty;
/// <summary>Text captured after "Interior:".</summary>
public string IntNumber { get; set; } = string.Empty;
/// <summary>Text captured after "Colonia:".</summary>
public string Colony { get; set; } = string.Empty;
/// <summary>Text captured after "Localidad:".</summary>
public string Locality { get; set; } = string.Empty;
/// <summary>Text captured after "Territorial:".</summary>
public string Municipality { get; set; } = string.Empty;
/// <summary>Text captured after "Federativa:".</summary>
public string State { get; set; } = string.Empty;
/// <summary>Text captured after "Entre Calle:".</summary>
public string Between { get; set; } = string.Empty;
/// <summary>Text captured after "Y Calle:".</summary>
public string AndBetween { get; set; } = string.Empty;

// ---- Contact values captured by Load ----

/// <summary>Text captured after "Electrónico:".</summary>
public string Email { get; set; } = string.Empty;
/// <summary>Digits captured after "Lada:".</summary>
public string AreaCode { get; set; } = string.Empty;
/// <summary>Digits captured after "Número:" that precede "Estado".</summary>
public string Phone { get; set; } = string.Empty;

// ---- Regimes: up to seven name/date pairs filled by Load ----

/// <summary>Number of regime names Load found, stored as text.</summary>
public string RegimeCount { get; set; } = string.Empty;
public string Regime1 { get; set; } = string.Empty;
public string RegimeDate1 { get; set; } = string.Empty;
public string Regime2 { get; set; } = string.Empty;
public string RegimeDate2 { get; set; } = string.Empty;
public string Regime3 { get; set; } = string.Empty;
public string RegimeDate3 { get; set; } = string.Empty;
public string Regime4 { get; set; } = string.Empty;
public string RegimeDate4 { get; set; } = string.Empty;
public string Regime5 { get; set; } = string.Empty;
public string RegimeDate5 { get; set; } = string.Empty;
public string Regime6 { get; set; } = string.Empty;
public string RegimeDate6 { get; set; } = string.Empty;
public string Regime7 { get; set; } = string.Empty;
public string RegimeDate7 { get; set; } = string.Empty;

/// <summary>
/// Creates an instance whose properties are all empty strings
/// (the values come from the property initializers above).
/// </summary>
public CSF()
{
}
/// <summary>
/// Opens the PDF at <paramref name="FileName"/>, extracts the text of every page and
/// fills a new <see cref="CSF"/> with the values captured by this class's regular expressions.
/// </summary>
/// <param name="FileName">Path of the PDF file to read.</param>
/// <returns>
/// The populated instance. A value whose pattern does not match is left as an empty string.
/// At most seven regime name/date pairs are stored; <see cref="RegimeCount"/> still reports
/// how many regime names were found.
/// </returns>
/// <exception cref="FileNotFoundException"><paramref name="FileName"/> does not exist.</exception>
/// <exception cref="InvalidDataException">
/// The extracted text does not contain the "CONSTANCIA DE SITUACIÓN FISCAL" title.
/// </exception>
public static CSF Load(string FileName) {
    if (!File.Exists(FileName))
    {
        throw new FileNotFoundException(FileName);
    }

    CSF result = new();
    result.FileName = Path.GetFileName(FileName);

    // Collect the page text in a StringBuilder rather than concatenating strings per page.
    var sb = new StringBuilder();
    using (PdfDocument document = PdfDocument.Open(FileName))
    {
        foreach (Page page in document.GetPages())
        {
            sb.Append(ContentOrderTextExtractor.GetText(page, true));
        }
    }

    // Flatten to a single line so the patterns can span what were separate lines,
    // then drop the page markers.
    string text = sb.ToString().ReplaceLineEndings(" ");
    text = CleanerRegex().Replace(text, "");
    result.Text = text;

    if (!CSFString().IsMatch(text))
    {
        // InvalidDataException derives from Exception, so existing catch (Exception) handlers still apply.
        throw new InvalidDataException("El archivo no parece ser una Constancia de Situación Fiscal");
    }

    // Returns the trimmed first capture group, or "" when the pattern does not match.
    string Capture(Regex pattern) => pattern.Match(text).Groups[1].Value.Trim();

    // Probably we must use more texts to detect if is a Moral CSF
    if (TypeRegex().IsMatch(text))
    {
        result.Type = Physical;
        result.Name = Capture(NamePhysicalRegex());
        result.Surname = Capture(SurnameRegex());
        result.LastSurname = Capture(LastSurnameRegex());
    }
    else
    {
        result.Type = Moral;
        result.Name = Capture(NameMoralRegex());
    }

    result.FullName = Capture(FullNameRegex());
    result.CIF = Capture(CIFRegex());
    result.RFC = Capture(RFCRegex());
    result.CURP = Capture(CURPRegex());

    // The issue-date capture can include the RFC; strip it. string.Replace throws
    // ArgumentException for an empty search value, so skip the call when no RFC was found.
    string issued = DateRegex().Match(text).Groups[1].Value;
    if (result.RFC.Length > 0)
    {
        issued = issued.Replace(result.RFC, "");
    }
    result.Date = issued.Trim();

    result.Status = Capture(StatusRegex());
    result.Started = Capture(StartedRegex());
    result.LastChange = Capture(LastChangeRegex());
    result.StreetType = Capture(StreetTypeRegex());
    result.Street = Capture(StreetRegex());
    result.Postal = Capture(PostalRegex());
    result.Number = Capture(ExternalNumberRegex());
    result.IntNumber = Capture(InternalNumberRegex());
    result.Colony = Capture(ColonyRegex());
    result.Locality = Capture(LocalityRegex());
    result.Municipality = Capture(MunicipalityRegex());
    result.State = Capture(StateRegex());
    result.Between = Capture(BetweenRegex());
    result.AndBetween = Capture(AndBetweenRegex());
    result.Email = Capture(EmailRegex());
    result.Phone = Capture(PhoneRegex());
    result.AreaCode = Capture(AreaCodeRegex());

    // Regime table: names are the fragments between dates, dates are collected separately
    // and paired with the names by position.
    // NOTE(review): a whitespace-only fragment between two adjacent dates still counts as a
    // name here (unchanged from the previous behavior) — confirm against real documents.
    var regimesRaw = Capture(RegimesRegex());
    var regimeNames = DateFormatRegex().Split(regimesRaw).Where(r => r.Length > 0).ToArray();
    var regimeDates = FindDatesRegex().Matches(regimesRaw).ToArray();
    result.RegimeCount = regimeNames.Length.ToString();

    // One setter per RegimeN/RegimeDateN pair, in order.
    var regimeSlots = new Action<string, string>[]
    {
        (name, date) => { result.Regime1 = name; result.RegimeDate1 = date; },
        (name, date) => { result.Regime2 = name; result.RegimeDate2 = date; },
        (name, date) => { result.Regime3 = name; result.RegimeDate3 = date; },
        (name, date) => { result.Regime4 = name; result.RegimeDate4 = date; },
        (name, date) => { result.Regime5 = name; result.RegimeDate5 = date; },
        (name, date) => { result.Regime6 = name; result.RegimeDate6 = date; },
        (name, date) => { result.Regime7 = name; result.RegimeDate7 = date; },
    };

    int filled = Math.Min(regimeNames.Length, regimeSlots.Length);
    for (int i = 0; i < filled; i++)
    {
        // There can be more name fragments than dates (e.g. trailing text after the last date);
        // leave the date empty instead of indexing past the end of regimeDates.
        string date = i < regimeDates.Length ? regimeDates[i].Groups[1].Value.Trim() : "";
        regimeSlots[i](regimeNames[i].Trim(), date);
    }

    return result;
}
/// <summary>Returns the value of the private Text field.</summary>
/// <returns>The stored text; an empty string until something assigns the field.</returns>
public string GetText() => Text;
// Document cleaner regex
// Matches page markers of the form "Página [n] de [m]" so Load can remove them from the
// extracted text. Page numbers may have any number of digits; one or two arbitrary
// characters are allowed between "Página" and the first bracket.
[GeneratedRegex("Página.{1,2}\\[\\d+\\] de \\[\\d+\\]")]
private static partial Regex CleanerRegex();
// Document detection regexs
[GeneratedRegex("CONSTANCIA DE SITUACIÓN FISCAL")]
private static partial Regex CSFString();
[GeneratedRegex("Primer Apellido:")]
private static partial Regex TypeRegex();
[GeneratedRegex("Contribuyentes (.*) Nombre, denominación")]
private static partial Regex FullNameRegex();
[GeneratedRegex("Nombre .s.: (.*) Primer")]
private static partial Regex NamePhysicalRegex();
[GeneratedRegex("Social: (.*) Régimen Capital:")]
private static partial Regex NameMoralRegex();
[GeneratedRegex("Apellido: (.*) Segundo")]
private static partial Regex SurnameRegex();
[GeneratedRegex("Segundo Apellido: (.*) Fecha i")]
private static partial Regex LastSurnameRegex();
[GeneratedRegex("idCIF: (.*) VALIDA")]
private static partial Regex CIFRegex();
[GeneratedRegex("RFC: (\\w{12,13})")]
private static partial Regex RFCRegex();
[GeneratedRegex("CURP: (.{18})")]
private static partial Regex CURPRegex();
[GeneratedRegex("Fecha de Emisión (.*) Datos de Iden")]
private static partial Regex DateRegex();
[GeneratedRegex("padrón: (\\w*)")]
private static partial Regex StatusRegex();
[GeneratedRegex("operaciones: (.*) Estatus")]
private static partial Regex StartedRegex();
[GeneratedRegex("estado: (\\d{2} \\w+ \\w+ \\w+ \\d{4})")]
private static partial Regex LastChangeRegex();
[GeneratedRegex("Vialidad: (.*) Nombre de V")]
private static partial Regex StreetTypeRegex();
[GeneratedRegex("Nombre de Vialidad: (.+) Número Exte")]
private static partial Regex StreetRegex();
[GeneratedRegex("\\D{1}(\\d{5}) ")]
private static partial Regex PostalRegex();
[GeneratedRegex("Exterior: (.*) Número Interior:")]
private static partial Regex ExternalNumberRegex();
[GeneratedRegex("Interior:(.*)Nombre de la Co")]
private static partial Regex InternalNumberRegex();
[GeneratedRegex("Colonia:(.*) Nombre de la Loca")]
private static partial Regex ColonyRegex();
[GeneratedRegex("Localidad:(.*)Nombre del")]
private static partial Regex LocalityRegex();
[GeneratedRegex("Territorial:(.*)Nombre de la Entidad")]
private static partial Regex MunicipalityRegex();
[GeneratedRegex("Federativa:(.*)Entre Ca")]
private static partial Regex StateRegex();
[GeneratedRegex("Entre Calle:(.*)Y Calle:")]
private static partial Regex BetweenRegex();
[GeneratedRegex("Y Calle:(.*)Correo")]
private static partial Regex AndBetweenRegex();
[GeneratedRegex("Electrónico:(.*@.+) Tel")]
private static partial Regex EmailRegex();
[GeneratedRegex("Número: (\\d+).?Estado")]
private static partial Regex PhoneRegex();
[GeneratedRegex("Lada: (\\d+).?Número")]
private static partial Regex AreaCodeRegex();
[GeneratedRegex(" Régimen Fecha Inicio Fecha Fin \\d?(.*?) (Obligaciones|Sus datos)")]
private static partial Regex RegimesRegex();
[GeneratedRegex("\\d{2}/\\d{2}/\\d{4}")]
private static partial Regex DateFormatRegex();
[GeneratedRegex("(\\d{2}/\\d{2}/\\d{4})")]
private static partial Regex FindDatesRegex();
}
}