-
Notifications
You must be signed in to change notification settings - Fork 42
feat: Handle complex/nested data types from a parquet file in Parquet Data Source: Struct, Repeated Struct, Maps and Timestamp of type SimpleGroup #148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b0a93f3
d27ce82
099287c
3419983
2cba26a
480603c
0e6a3bc
02867bd
c264de7
e9d0d4c
56315b1
c82d0be
0ed5f41
1a6f12d
d9ae93d
7690b89
189fb7a
b83aba3
1bd9598
e73d736
ab33372
d99e1ea
b2fd657
a3223a2
dc45553
2f96b4c
374f98c
c7d4ea7
cd2a143
5b52214
3cc0828
359cbef
bfde8a4
45f9789
bc829d3
fde86ac
55ce9fc
1a243a5
2e073bc
8ccb516
e82bea0
f70ee38
a490dbd
0c43843
9b89593
a2a8c2c
a85b07e
17a7284
1f32b54
d01fb95
e0705be
960886c
071122f
4c4dbe2
5200fbd
9ba9c2c
255814e
4b375c6
bec88e9
20890f6
9a30c92
c4a7651
7104a05
c847670
8b3bd8d
a28e7a7
651583f
f2de48a
4cb77f7
2ef46ec
5360b14
91afc8f
6037afc
a236b3b
a8d3278
3f7582c
07e86c5
09cb1af
8375565
cd83662
13cac7e
69c5cb9
2d566df
908a7aa
dc7cff9
0ff1de6
357a4f5
5a03736
56d1dcc
0a64f11
e3321d8
95e334c
437ecc1
0aaf743
dc1286c
057f46f
66727f8
c9b773a
945d21c
8be0e78
cfa4990
0972a1f
cba471b
9606844
b51fbaa
c5ff670
ab992c3
e732769
e372b31
4830b1a
35736bb
fadee0b
daebbfe
f2abffb
19612ed
ae6bf6b
d61f6e3
6cad4ee
a0cb47d
c518d0f
bde26e2
05763f9
6d58b98
6523779
404f476
5eb3b55
556ccec
642a0ea
0a44559
514793d
e298aed
5cae0f4
66dcf95
539dbd0
f6cf116
7b51bb6
a2c0c5c
1850a61
83489dd
4fa0986
3877689
1714c4d
18d2752
4034cba
0cef54c
ad796c5
4a2fb73
711e95b
9564169
8514ef0
a698cbb
93615e7
bfed30b
3809d3a
bd5006d
4e2452c
7d90975
05a0d7a
2beddc0
cb17f2c
10f20ee
2dfb01b
18ee55d
814e4dd
f106599
cc97d43
26961c1
26b77f6
01bacb8
4a82762
fcac8d4
a867b9e
3d29d9a
a939b4c
c37bb83
7637420
7d35c53
0dbff61
cdcf558
21331f5
b843098
a273499
ea3d4c0
0c25068
2a91c1f
9f02b87
c55c330
15da9d8
49c555d
47cc2a6
58db910
739cb02
e53d0db
85c6547
a53914f
3fa3de7
f93731a
98647f7
84e2800
043fcff
20dfd78
469cbb4
4fbae98
50b3096
f2be348
ec84d37
369b7bf
97260b8
559d179
2352114
c35f9b3
4512b31
f75ae92
b3e2e8a
cdf0952
e1d080c
581ef5e
8889362
30146a5
6908c87
256abb2
59b4224
a906f3f
564f330
dc48070
e63086b
a28474c
d3a95ab
ee2852c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,94 @@ | ||
| package io.odpf.dagger.common.serde.parquet; | ||
|
|
||
| import org.apache.parquet.example.data.simple.SimpleGroup; | ||
| import org.apache.parquet.schema.GroupType; | ||
| import org.apache.parquet.schema.LogicalTypeAnnotation; | ||
| import org.apache.parquet.schema.Type; | ||
|
|
||
| import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; | ||
| import static org.apache.parquet.schema.Type.Repetition.REPEATED; | ||
| import static org.apache.parquet.schema.Type.Repetition.REQUIRED; | ||
|
|
||
| public class SimpleGroupValidation { | ||
| public static boolean checkFieldExistsAndIsInitialized(SimpleGroup simpleGroup, String fieldName) { | ||
| return simpleGroup.getType().containsField(fieldName) && simpleGroup.getFieldRepetitionCount(fieldName) != 0; | ||
| } | ||
|
|
||
| /** | ||
| * This method checks if the map field inside the simple group is | ||
| * serialized using this legacy format: | ||
| * {@code | ||
| * <pre> | ||
| * repeated group <name> { | ||
| * <repetition-type> <data-type> key; | ||
| * <repetition-type> <data-type> value; | ||
| * } | ||
| * </pre> | ||
| * } | ||
| * The outer group is always repeated. key and value are constant field names. | ||
| * | ||
| * @param simpleGroup The SimpleGroup object inside which the map field is present | ||
| * @param fieldName The name of the map field | ||
| * @return true, if the map structure follows the spec and false otherwise. | ||
| */ | ||
| public static boolean checkIsLegacySimpleGroupMap(SimpleGroup simpleGroup, String fieldName) { | ||
| if (!(simpleGroup.getType().getType(fieldName) instanceof GroupType)) { | ||
| return false; | ||
| } | ||
| GroupType nestedMapGroupType = simpleGroup.getType().getType(fieldName).asGroupType(); | ||
| return nestedMapGroupType.isRepetition(Type.Repetition.REPEATED) | ||
| && nestedMapGroupType.getFieldCount() == 2 | ||
| && nestedMapGroupType.containsField("key") | ||
| && nestedMapGroupType.containsField("value"); | ||
| } | ||
|
|
||
| /** | ||
| * This method checks if the map field inside the simple group is | ||
| * serialized using this standard parquet map specification: | ||
| * {@code | ||
| * <pre> | ||
| * <repetition-type> group <name> (MAP) { | ||
| * repeated group key_value { | ||
| * required <data-type> key; | ||
| * <repetition-type> <data-type> value; | ||
| * } | ||
| * } | ||
| * </pre> | ||
| * } | ||
| * The validation checks below follow the <a href="https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps">Apache Parquet LogicalTypes Specification</a>for Maps. | ||
| * | ||
| * @param simpleGroup The SimpleGroup object inside which the map field is present | ||
| * @param fieldName The name of the map field | ||
| * @return true, if the map structure follows the spec and false otherwise. | ||
| */ | ||
| public static boolean checkIsStandardSimpleGroupMap(SimpleGroup simpleGroup, String fieldName) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this looks more like a method in mapHandler rather than a static method here. Any particular rationale for not keeping it there?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed, we can keep this for now |
||
| return applyMapFieldValidations(simpleGroup, fieldName) | ||
| && applyNestedKeyValueFieldValidations(simpleGroup, fieldName); | ||
| } | ||
|
|
||
| private static boolean applyMapFieldValidations(SimpleGroup simpleGroup, String fieldName) { | ||
| Type mapType = simpleGroup.getType().getType(fieldName); | ||
| if (mapType instanceof GroupType) { | ||
| GroupType mapGroupType = mapType.asGroupType(); | ||
| return (mapGroupType.getRepetition().equals(OPTIONAL) | ||
| || mapGroupType.isRepetition(REQUIRED)) | ||
| && mapGroupType.getLogicalTypeAnnotation().equals(LogicalTypeAnnotation.mapType()) | ||
| && mapGroupType.getFieldCount() == 1; | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| private static boolean applyNestedKeyValueFieldValidations(SimpleGroup simpleGroup, String fieldName) { | ||
| GroupType mapGroupType = simpleGroup.getType().getType(fieldName).asGroupType(); | ||
| if (mapGroupType.containsField("key_value")) { | ||
| Type nestedKeyValueType = mapGroupType.getType("key_value"); | ||
| if (nestedKeyValueType instanceof GroupType) { | ||
| GroupType nestedKeyValueGroupType = nestedKeyValueType.asGroupType(); | ||
| return nestedKeyValueGroupType.isRepetition(REPEATED) | ||
| && nestedKeyValueGroupType.containsField("key") | ||
| && nestedKeyValueGroupType.getType("key").isRepetition(REQUIRED); | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this looks more like a method in mapHandler rather than a static method here. Any particular rationale for not keeping it there?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are mainly 2 reasons:
ifbased checks and hence this needs to be tested for different corner cases. The unit tests are verbose. Here it is a public function and can be tested as it is, but when placed in MapHandler, this would ideally become a private function and hence the entire unit would need to be tested through the public interface oftransformFromParquet. This method already has a lot of big tests right now and moving this validator function to MapHandler would result in a lot of hard to understand tests. The MapHandler class is already quite big and I don't want to make it bigger.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As discussed, let's keep this for now