Hi again, given the list of fields considered to identify the duplicates in the Locality table, here is my query to list them all. The collection I was working on did not use any joined table such as locality details so please include that as well if it applies to your case.
SELECT GROUP_CONCAT(localityid ORDER BY localityid) AS duplicate_ids, Datum, ElevationAccuracy, ElevationMethod, GML, Lat1Text, Lat2Text, LatLongAccuracy, LatLongMethod, LatLongType, Latitude1, Latitude2, LocalityName, Long1Text, Long2Text, Longitude1, Longitude2, MaxElevation, MinElevation, NamedPlace, OriginalElevationUnit, OriginalLatLongUnit, RelationToNamedPlace, Remarks, SGRStatus, ShortName, SrcLatLongUnit, Text1, Text2, Text3, Text4, Text5, UniqueIdentifier, VerbatimElevation, VerbatimLatitude, VerbatimLongitude, Visibility, YesNo1, YesNo2, YesNo3, YesNo4, YesNo5, DisciplineID, GeographyID, PaleoContextID, VisibilitySetByID
FROM locality
GROUP BY Datum, ElevationAccuracy, ElevationMethod, GML, Lat1Text, Lat2Text, LatLongAccuracy, LatLongMethod, LatLongType, Latitude1, Latitude2, LocalityName, Long1Text, Long2Text, Longitude1, Longitude2, MaxElevation, MinElevation, NamedPlace, OriginalElevationUnit, OriginalLatLongUnit, RelationToNamedPlace, Remarks, SGRStatus, ShortName, SrcLatLongUnit, Text1, Text2, Text3, Text4, Text5, UniqueIdentifier, VerbatimElevation, VerbatimLatitude, VerbatimLongitude, Visibility, YesNo1, YesNo2, YesNo3, YesNo4, YesNo5, DisciplineID, GeographyID, PaleoContextID, VisibilitySetByID
HAVING COUNT(*) > 1;
Then, based on this group_concat (given the list of fields considered to identify the duplicates), I’ve created a procedure that (1) reassigns all related collecting events to the first locality in each group of duplicates, and (2) deletes the remaining localities that are no longer referenced.
DROP PROCEDURE IF EXISTS merge_locality_duplicates;
DELIMITER $$
CREATE PROCEDURE merge_locality_duplicates()
BEGIN
DECLARE done INT DEFAULT FALSE;
DECLARE id_list TEXT;
DECLARE first_id INT;
-- Cursor to loop through duplicate locality groups
DECLARE cur CURSOR FOR
SELECT GROUP_CONCAT(localityid ORDER BY localityid) AS duplicate_ids
FROM locality
GROUP BY Datum, ElevationAccuracy, ElevationMethod, GML, Lat1Text, Lat2Text,
LatLongAccuracy, LatLongMethod, LatLongType, Latitude1, Latitude2,
LocalityName, Long1Text, Long2Text, Longitude1, Longitude2,
MaxElevation, MinElevation, NamedPlace, OriginalElevationUnit,
OriginalLatLongUnit, RelationToNamedPlace, Remarks, SGRStatus,
ShortName, SrcLatLongUnit, Text1, Text2, Text3, Text4, Text5,
UniqueIdentifier, VerbatimElevation, VerbatimLatitude,
VerbatimLongitude, Visibility, YesNo1, YesNo2, YesNo3, YesNo4,
YesNo5, DisciplineID, GeographyID, PaleoContextID, VisibilitySetByID
HAVING COUNT(*) > 1;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
-- Step 1: Update collectingevent.localityid to point to the first ID in each group
OPEN cur;
read_loop: LOOP
FETCH cur INTO id_list;
IF done THEN
LEAVE read_loop;
END IF;
SET first_id = SUBSTRING_INDEX(id_list, ',', 1);
-- Replace all localityids in collectingevent with the first_id of the group
UPDATE collectingevent
SET localityid = first_id
WHERE FIND_IN_SET(localityid, id_list) > 0
AND localityid != first_id;
-- Step 2: Delete all duplicate locality IDs except the first one
DELETE FROM locality
WHERE FIND_IN_SET(localityid, id_list) > 0
AND localityid != first_id;
END LOOP;
CLOSE cur;
END$$
DELIMITER ;
CALL merge_locality_duplicates();